FWIW, this is the relevant part of the test driver I’m using to switch between the different fusers:
# NOTE(review): excerpt of a longer if/elif chain that dispatches on `arg`;
# the opening `if` and the original indentation are not visible here.
# Each branch configures the TorchScript JIT fusers through private
# `torch._C` hooks before the test runs.

# NNC branch: turns on the CPU and GPU fusion overrides and the parallel-CPU
# texpr flag, and clears the "must use LLVM on CPU" flag.
elif arg == '--fuser-nnc':
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
torch._C._jit_set_texpr_parallel_cpu_enabled(True)
torch._C._jit_set_te_must_use_llvm_cpu(False)
# Presumably read by the TensorExpr backend to skip LLVM codegen -- TODO confirm.
os.environ['PYTORCH_TENSOREXPR_DONT_USE_LLVM'] = '1'
# NNC + LLVM branch: same three enabling calls as `--fuser-nnc`, but without
# clearing the must-use-LLVM flag and without setting the DONT_USE_LLVM env var.
elif arg == '--fuser-nnc-llvm':
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
torch._C._jit_set_texpr_parallel_cpu_enabled(True)
# nvFuser branch: switches off the CPU/GPU fusion overrides and the texpr
# fuser, then enables nvFuser. The FMA-disable env var below is left commented out.
elif arg == '--nvfuser':
#os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1'
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)
torch._C._jit_set_texpr_fuser_enabled(False)
torch._C._jit_set_nvfuser_enabled(True)
I’m not seeing great results so far, to be honest.