I am trying to compile a RoBERTa-large model for an inf1 instance with batch size 4. I was able to compile the model successfully for batch size 1, but I get the following error when I try to compile it for any batch size greater than 1.
Instance used for compilation: inf1.24xlarge
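For reference, tracing with the batch size 1 inputs succeeds using essentially the same call (a minimal sketch; input_1 is the tokenizer output shown in the code further below):

import torch
import torch.neuron

# Batch size 1: passing the raw tokenizer output traces and compiles fine
model_neuron_bs1 = torch.neuron.trace(
    model,
    (input_1['input_ids'], input_1['attention_mask']),
    compiler_args=['-O2'],
    strict=False,
)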
...............................................................................................................
Compiler status PASS
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_46053/3817965199.py in <module>
4 torch.cat([input_1['attention_mask']] * batch_size,0)
5 )
----> 6 model_neuron = torch.neuron.trace(model, neuron_inputs, compiler_args=['-O2'],strict=False)
7 #reranker.config.update({"traced_sequence_length": max_length})
8
~/pytorch_venv/lib/python3.7/site-packages/torch_neuron/convert.py in trace(func, example_inputs, fallback, op_whitelist, minimum_segment_size, subgraph_builder_function, subgraph_inputs_pruning, skip_compiler, debug_must_trace, allow_no_ops_on_neuron, compiler_workdir, dynamic_batch_size, compiler_timeout, single_fusion_ratio_threshold, _neuron_trace, compiler_args, optimizations, verbose, **kwargs)
192 with skip_inference_context() as s:
193 logger.debug("skip_inference_context - trace with fallback at {}".format(get_file_and_line()))
--> 194 neuron_graph = cu.compile_fused_operators(neuron_graph, **compile_kwargs)
195 cu.stats_post_compiler(neuron_graph)
196
~/pytorch_venv/lib/python3.7/site-packages/torch_neuron/convert.py in compile_fused_operators(self, neuron_graph, **kwargs)
407
408 # STEP 2: Invoke the graphs, passing an "op_converter" or compiler functor to be invoked on each subgraph - mutate the model
--> 409 neuron_graph(*neuron_graph.example_inputs, op_converter=op_converter)
410
411 # STEP 3: (re)Import the compiled sub-graphs as fused neuron operators
~/pytorch_venv/lib/python3.7/site-packages/torch_neuron/graph.py in __call__(self, op_converter, *inputs)
196 unique_tensor_map[unique] = execute.no_grad_tensor(tensor)
197 for op in self.operations:
--> 198 self.run_op(op, unique_tensor_map, op_converter)
199 if op.name in self._op_name_to_free_uniques:
200 for unique in self._op_name_to_free_uniques[op.name]:
~/pytorch_venv/lib/python3.7/site-packages/torch_neuron/graph.py in run_op(self, op, unique_tensor_map, op_converter)
207 if op_converter is not None:
208 op = op_converter(op, inputs)
--> 209 outputs = op(*inputs)
210 if len(op.output_uniques) == 1:
211 outputs = [outputs]
~/pytorch_venv/lib/python3.7/site-packages/torch_neuron/graph.py in __call__(self, *inputs)
340
341 def __call__(self, *inputs):
--> 342 return self.func(self, *inputs)
343
344 def uses(self, index):
~/pytorch_venv/lib/python3.7/site-packages/torch_neuron/resolve_function.py in func_from_schema(schema, op, *args)
69 if isinstance(overloads, tuple):
70 overloads, argument_names = overloads
---> 71 return overloads(**kwargs)
72
73
RuntimeError: The expanded size of the tensor (256) must match the existing size (4) at non-singleton dimension 1. Target sizes: [4, 256]. Tensor sizes: [4]
Below is the code I am using to compile the model:
import torch
import torch.neuron

# tokenizer, model, and input_sent_a are defined earlier (not shown)
input_1 = tokenizer.encode_plus(text=input_sent_a,
                                return_tensors="pt",
                                padding='max_length',
                                truncation=True,
                                max_length=512)

# Replicate the single example along dim 0 to build a batch of 4
batch_size = 4
neuron_inputs = (
    torch.cat([input_1['input_ids']] * batch_size, 0),
    torch.cat([input_1['attention_mask']] * batch_size, 0),
)

# This is the call that raises the RuntimeError above
model_neuron = torch.neuron.trace(model, neuron_inputs,
                                  compiler_args=['-O2'], strict=False)
# reranker.config.update({"traced_sequence_length": max_length})
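Once tracing succeeds, I expect to run and save the compiled module like this (a sketch; the filename is just an example):

# A traced Neuron model runs like a regular TorchScript module
outputs = model_neuron(*neuron_inputs)

# Save the compiled artifact for deployment on the inf1 instance
model_neuron.save('roberta_large_neuron_b4.pt')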
Any help would be appreciated.