Test Kpu Now

# Timed test
start = time.time()
for _ in range(100):
    c = torch.mm(a, b)
torch.cuda.synchronize()
elapsed = time.time() - start

device = torch.device("cuda")
# Mixed precision to trigger tensor cores
dtype = torch.float16
test kpu

# Warm-up
for _ in range(5):
    c = torch.mm(a, b)
torch.cuda.synchronize()
# Timed test
start = time