Coverage for mlinsights/mlmodel/sklearn_text.py: 95%
39 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-28 08:46 +0100
1"""
2@file
3@brief Overloads :epkg:`TfidfVectorizer` and :epkg:`CountVectorizer`.
4"""
5from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
6try:
7 from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin
8except ImportError: # pragma: no cover
9 # scikit-learn < 0.23
10 from sklearn.feature_extraction.text import VectorizerMixin
class NGramsMixin(VectorizerMixin):
    """
    Overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to get tuples instead of string in member `vocabulary_
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_.
    of :epkg:`TfidfVectorizer` or :epkg:`CountVectorizer`.
    It contains the list of n-grams used to process documents.
    See @see cl TraceableCountVectorizer and @see cl TraceableTfidfVectorizer
    for example.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering.

        @param      tokens          list of tokens (strings, or tuples of
                                    strings once already processed)
        @param      stop_words      optional container of stop words to drop
        @return                     list of n-grams, each one a tuple of words
        """
        # Normalise every token to a tuple so the resulting vocabulary
        # keeps track of the words each n-gram was built from.
        if tokens is not None:
            new_tokens = []
            for token in tokens:
                new_tokens.append(
                    (token,) if isinstance(token, str) else token)
            tokens = new_tokens

        if stop_words is not None:
            # BUG FIX: tokens were normalised to tuples above, so the
            # previous code compared a *tuple* against the stop words
            # (always a miss, nothing was ever filtered) and wrapped each
            # tuple a second time, producing nested tuples. Compare the
            # wrapped word itself, mirroring scikit-learn's behaviour.
            tokens = [w for w in tokens if w[0] not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append

            def space_join(tokens):
                # Flatten a window of tuples (or bare strings) into one
                # flat tuple representing the n-gram.
                new_tokens = []
                for token in tokens:
                    if isinstance(token, str):
                        new_tokens.append(token)
                    elif isinstance(token, tuple):
                        new_tokens.extend(token)
                    else:
                        raise TypeError(  # pragma: no cover
                            f"Unable to build a n-grams out of {tokens}.")
                return tuple(new_tokens)

            # Slide a window of every size in [min_n, max_n] over the
            # original tokens, exactly as scikit-learn does.
            for n in range(min_n,
                           min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))
        return tokens
class TraceableCountVectorizer(CountVectorizer, NGramsMixin):
    """
    A :epkg:`CountVectorizer` which stores its vocabulary as tuples of
    words instead of merged strings, by inheriting from @see cl NGramsMixin
    which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_.
    The numerical outputs are the same as :epkg:`CountVectorizer`'s.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import CountVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('CountVectorizer from scikit-learn')
        mod1 = CountVectorizer(ngram_range=(1, 2))
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableCountVectorizer from scikit-learn')
        mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])

    A weirder example with
    @see cl TraceableTfidfVectorizer shows more differences.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """Delegates n-gram construction to @see cl NGramsMixin so that
        the vocabulary keeps tuples of words."""
        return NGramsMixin._word_ngrams(self, tokens, stop_words)
class TraceableTfidfVectorizer(TfidfVectorizer, NGramsMixin):
    """
    A :epkg:`TfidfVectorizer` which stores its vocabulary as tuples of
    words instead of merged strings, by inheriting from @see cl NGramsMixin
    which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_.
    The numerical outputs are the same as :epkg:`TfidfVectorizer`'s.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import TfidfVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('TfidfVectorizer from scikit-learn')
        mod1 = TfidfVectorizer(ngram_range=(1, 2),
                               token_pattern="[a-zA-Z ]{1,4}")
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableTfidfVectorizer from scikit-learn')
        mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2),
                                        token_pattern="[a-zA-Z ]{1,4}")
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """Delegates n-gram construction to @see cl NGramsMixin so that
        the vocabulary keeps tuples of words."""
        return NGramsMixin._word_ngrams(self, tokens, stop_words)