Coverage for mlprodict/onnxrt/ops_cpu/op_dict_vectorizer.py: 100%

31 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1# -*- encoding: utf-8 -*- 

2# pylint: disable=E0203,E1101,C0111 

3""" 

4@file 

5@brief Runtime operator. 

6""" 

7import numpy 

8from scipy.sparse import coo_matrix 

9from ._op import OpRun, RuntimeTypeError 

10 

11 

class DictVectorizer(OpRun):
    """
    Runtime for a *DictVectorizer* operator: converts a sequence of
    dictionaries into a sparse matrix whose columns follow the order
    of the vocabulary stored in the node attributes.

    Exactly one of the attributes *int64_vocabulary* or
    *string_vocabulary* is used: the integer vocabulary takes
    precedence whenever it is not empty.
    """

    # Default values of the expected attributes (both vocabularies empty).
    atts = {'int64_vocabulary': numpy.empty(0, dtype=numpy.int64),
            'string_vocabulary': numpy.empty(0, dtype=numpy.str_)}

    def __init__(self, onnx_node, desc=None, **options):
        """
        Builds the mapping *key -> column index* from the vocabulary.

        :param onnx_node: forwarded to ``OpRun.__init__``
        :param desc: forwarded to ``OpRun.__init__``
        :param options: forwarded to ``OpRun.__init__``
        :raises RuntimeError: if both vocabularies are empty
        """
        # NOTE(review): attributes int64_vocabulary / string_vocabulary are
        # presumably set on self by OpRun.__init__ -- confirm in ._op.
        OpRun.__init__(self, onnx_node, desc=desc,
                       expected_attributes=DictVectorizer.atts,
                       **options)
        self.is_int = len(self.int64_vocabulary) > 0
        if self.is_int:
            self.dict_labels = {
                v: i for i, v in enumerate(self.int64_vocabulary)}
        else:
            # Entries may be raw bytes or already decoded strings
            # (the default attribute is declared with dtype numpy.str_);
            # only bytes need decoding, str has no ``decode`` method.
            self.dict_labels = {
                (v.decode('utf-8') if isinstance(v, bytes) else v): i
                for i, v in enumerate(self.string_vocabulary)}
        if len(self.dict_labels) == 0:
            raise RuntimeError(  # pragma: no cover
                "int64_vocabulary and string_vocabulary cannot be both empty.")

    def _run(self, x, attributes=None, verbose=0, fLOG=None):  # pylint: disable=W0221
        """
        Converts *x* into a sparse matrix.

        :param x: list or numpy array, every element must expose
            ``items()`` yielding pairs *(key, value)*
        :param attributes: unused here
        :param verbose: unused here
        :param fLOG: unused here
        :return: tuple with a single :class:`scipy.sparse.coo_matrix`
            of shape ``(len(x), len(self.dict_labels))``
        :raises RuntimeTypeError: if *x* is neither a numpy array nor a list
        :raises KeyError: if a key of *x* is not part of the vocabulary
        """
        if not isinstance(x, (numpy.ndarray, list)):
            raise RuntimeTypeError(  # pragma: no cover
                f"x must be iterable not {type(x)}.")
        values = []
        rows = []
        cols = []
        for i, row in enumerate(x):
            for k, v in row.items():
                values.append(v)
                rows.append(i)
                cols.append(self.dict_labels[k])
        values = numpy.array(values)
        # Explicit integer dtype: an empty python list would otherwise
        # produce float64 arrays, which are not valid index arrays.
        rows = numpy.array(rows, dtype=numpy.int64)
        cols = numpy.array(cols, dtype=numpy.int64)
        return (coo_matrix((values, (rows, cols)),
                           shape=(len(x), len(self.dict_labels))), )