Coverage for mlinsights/mlmodel/sklearn_text.py: 95%

39 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-08-09 08:45 +0200

1""" 

2@file 

3@brief Overloads :epkg:`TfidfVectorizer` and :epkg:`CountVectorizer`. 

4""" 

5from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

6try: 

7 from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin 

8except ImportError: # pragma: no cover 

9 # scikit-learn < 0.23 

10 from sklearn.feature_extraction.text import VectorizerMixin 

11 

12 

class NGramsMixin(VectorizerMixin):
    """
    Overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to get tuples instead of string in member `vocabulary_
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_.
    of :epkg:`TfidfVectorizer` or :epkg:`CountVectorizer`.
    It contains the list of n-grams used to process documents.
    See @see cl TraceableCountVectorizer and @see cl TraceableTfidfVectorizer
    for example.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering.

        :param tokens: tokens of one document, strings or tuples of strings
        :param stop_words: container of words to drop, or None to keep all
        :return: list of n-grams, each one a tuple of strings
        """
        # Normalize every token into a tuple so the produced n-grams are
        # tuples of words rather than space-joined strings.
        if tokens is not None:
            tokens = [(token,) if isinstance(token, str) else token
                      for token in tokens]

        if stop_words is not None:
            # BUGFIX: tokens are tuples at this point, so the previous code
            # `[(w, ) for w in tokens if w not in stop_words]` compared a
            # tuple against a set of strings (never matching, so stop words
            # were never removed) and wrapped each token a second time,
            # yielding nested tuples. Filter on the word itself instead,
            # keeping multi-word tuples untouched.
            tokens = [w for w in tokens
                      if not (len(w) == 1 and w[0] in stop_words)]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append

            def space_join(tokens):
                # Flatten a run of tokens (strings or tuples of strings)
                # into a single tuple of words representing one n-gram.
                new_tokens = []
                for token in tokens:
                    if isinstance(token, str):
                        new_tokens.append(token)
                    elif isinstance(token, tuple):
                        new_tokens.extend(token)
                    else:
                        raise TypeError(  # pragma: no cover
                            f"Unable to build a n-grams out of {tokens}.")
                return tuple(new_tokens)

            # Same sliding-window enumeration as scikit-learn's
            # implementation, but joining into tuples instead of strings.
            for n in range(min_n,
                           min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))
        return tokens

72 

73 

class TraceableCountVectorizer(CountVectorizer, NGramsMixin):
    """
    A :epkg:`CountVectorizer` whose vocabulary keeps n-grams as tuples.
    It relies on @see cl NGramsMixin, which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to retain more information about the n-grams while producing the very
    same outputs as :epkg:`CountVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import CountVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('CountVectorizer from scikit-learn')
        mod1 = CountVectorizer(ngram_range=(1, 2))
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableCountVectorizer from scikit-learn')
        mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])

    A weirder example with
    @see cl TraceableTfidfVectorizer shows more differences.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        # Route n-gram construction through NGramsMixin so that entries
        # of `vocabulary_` are tuples of words instead of joined strings.
        return NGramsMixin._word_ngrams(self, tokens, stop_words)

114 

115 

class TraceableTfidfVectorizer(TfidfVectorizer, NGramsMixin):
    """
    A :epkg:`TfidfVectorizer` whose vocabulary keeps n-grams as tuples.
    It relies on @see cl NGramsMixin, which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to retain more information about the n-grams while producing the very
    same outputs as :epkg:`TfidfVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import TfidfVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('TfidfVectorizer from scikit-learn')
        mod1 = TfidfVectorizer(ngram_range=(1, 2),
                               token_pattern="[a-zA-Z ]{1,4}")
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableTfidfVectorizer from scikit-learn')
        mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2),
                                        token_pattern="[a-zA-Z ]{1,4}")
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])
    """

    def _word_ngrams(self, tokens, stop_words=None):
        # Route n-gram construction through NGramsMixin so that entries
        # of `vocabulary_` are tuples of words instead of joined strings.
        return NGramsMixin._word_ngrams(self, tokens, stop_words)