Coverage for mlinsights/mlmodel/sklearn_text.py: 95%

39 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-08-09 08:45 +0200

1""" 

2@file 

3@brief Overloads :epkg:`TfidfVectorizer` and :epkg:`CountVectorizer`. 

4""" 

5from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

6try: 

7 from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin 

8except ImportError: # pragma: no cover 

9 # scikit-learn < 0.23 

10 from sklearn.feature_extraction.text import VectorizerMixin 

11 

12 

class NGramsMixin(VectorizerMixin):
    """
    Overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to get tuples instead of string in member `vocabulary_
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_.
    of :epkg:`TfidfVectorizer` or :epkg:`CountVectorizer`.
    It contains the list of n-grams used to process documents.
    See @see cl TraceableCountVectorizer and @see cl TraceableTfidfVectorizer
    for example.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering.

        :param tokens: tokens of one document, strings or tuples of strings
        :param stop_words: container of words to drop, or None to keep all
        :return: list of n-grams, each one a tuple of strings
        """
        # Normalize every token into a tuple so the produced n-grams are
        # tuples of words rather than space-joined strings.
        if tokens is not None:
            tokens = [(token,) if isinstance(token, str) else token
                      for token in tokens]

        if stop_words is not None:
            # BUGFIX: tokens are tuples at this point, so the previous code
            # `[(w, ) for w in tokens if w not in stop_words]` compared a
            # tuple against a set of strings (never matching, so stop words
            # were never removed) and wrapped each token a second time,
            # yielding nested tuples. Filter on the word itself instead,
            # keeping multi-word tuples untouched.
            tokens = [w for w in tokens
                      if not (len(w) == 1 and w[0] in stop_words)]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append

            def space_join(tokens):
                # Flatten a run of tokens (strings or tuples of strings)
                # into a single tuple of words representing one n-gram.
                new_tokens = []
                for token in tokens:
                    if isinstance(token, str):
                        new_tokens.append(token)
                    elif isinstance(token, tuple):
                        new_tokens.extend(token)
                    else:
                        raise TypeError(  # pragma: no cover
                            f"Unable to build a n-grams out of {tokens}.")
                return tuple(new_tokens)

            # Same sliding-window enumeration as scikit-learn's
            # implementation, but joining into tuples instead of strings.
            for n in range(min_n,
                           min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))
        return tokens

72 

73 

class TraceableCountVectorizer(CountVectorizer, NGramsMixin):
    """
    A :epkg:`CountVectorizer` whose vocabulary keeps n-grams as tuples.
    It relies on @see cl NGramsMixin, which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to retain more information about the n-grams while producing the very
    same outputs as :epkg:`CountVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import CountVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('CountVectorizer from scikit-learn')
        mod1 = CountVectorizer(ngram_range=(1, 2))
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableCountVectorizer from scikit-learn')
        mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])

    A weirder example with
    @see cl TraceableTfidfVectorizer shows more differences.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        # Route n-gram construction through NGramsMixin so that entries
        # of `vocabulary_` are tuples of words instead of joined strings.
        return NGramsMixin._word_ngrams(self, tokens, stop_words)

114 

115 

class TraceableTfidfVectorizer(TfidfVectorizer, NGramsMixin):
    """
    A :epkg:`TfidfVectorizer` whose vocabulary keeps n-grams as tuples.
    It relies on @see cl NGramsMixin, which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to retain more information about the n-grams while producing the very
    same outputs as :epkg:`TfidfVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import TfidfVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('TfidfVectorizer from scikit-learn')
        mod1 = TfidfVectorizer(ngram_range=(1, 2),
                               token_pattern="[a-zA-Z ]{1,4}")
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableTfidfVectorizer from scikit-learn')
        mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2),
                                        token_pattern="[a-zA-Z ]{1,4}")
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])
    """

    def _word_ngrams(self, tokens, stop_words=None):
        # Route n-gram construction through NGramsMixin so that entries
        # of `vocabulary_` are tuples of words instead of joined strings.
        return NGramsMixin._word_ngrams(self, tokens, stop_words)