module mlinsights.mlmodel.sklearn_text
#
Short summary#
module mlinsights.mlmodel.sklearn_text
Overloads TfidfVectorizer and CountVectorizer.
Classes#
class | truncated documentation
---|---
NGramsMixin | Overloads method _word_ngrams …
TraceableCountVectorizer | Inherits from NGramsMixin
TraceableTfidfVectorizer | Inherits from NGramsMixin
Properties#
property | truncated documentation
---|---
_repr_html_ (TraceableCountVectorizer) | HTML representation of estimator. This is redundant with the logic of _repr_mimebundle_. The latter should …
_repr_html_ (TraceableTfidfVectorizer) | HTML representation of estimator. This is redundant with the logic of _repr_mimebundle_. The latter should …
idf_ (TraceableTfidfVectorizer) | Inverse document frequency vector, only defined if use_idf=True. Returns ndarray of shape …
Methods#
method | truncated documentation
---|---
_word_ngrams | Turn tokens into a sequence of n-grams after stop words filtering
Documentation#
Overloads TfidfVectorizer and CountVectorizer.
- class mlinsights.mlmodel.sklearn_text.NGramsMixin#
Bases:
_VectorizerMixin
Overloads method _word_ngrams to get tuples instead of strings in member vocabulary_ of TfidfVectorizer or CountVectorizer. It contains the list of n-grams used to process documents. See
TraceableCountVectorizer
and TraceableTfidfVectorizer
for example.
- _word_ngrams(tokens, stop_words=None)#
Turn tokens into a sequence of n-grams after stop words filtering
- class mlinsights.mlmodel.sklearn_text.TraceableCountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)#
Bases:
CountVectorizer
, NGramsMixin
Inherits from
NGramsMixin
which overloads method _word_ngrams to keep more information about n-grams but still produces the same outputs as CountVectorizer.
import numpy
from sklearn.feature_extraction.text import CountVectorizer
from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer
from pprint import pformat

corpus = numpy.array([
    "This is the first document.",
    "This document is the second document.",
    "Is this the first document?",
    "",
]).reshape((4, ))

print('CountVectorizer from scikit-learn')
mod1 = CountVectorizer(ngram_range=(1, 2))
mod1.fit(corpus)
print(mod1.transform(corpus).todense()[:2])
print(pformat(mod1.vocabulary_)[:100])

print('TraceableCountVectorizer from scikit-learn')
mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
mod2.fit(corpus)
print(mod2.transform(corpus).todense()[:2])
print(pformat(mod2.vocabulary_)[:100])
>>>
CountVectorizer from scikit-learn [[1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0] [2 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0]] {'document': 0, 'document is': 1, 'first': 2, 'first document': 3, 'is': 4, 'is the': 5, 'is t TraceableCountVectorizer from scikit-learn [[1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0] [2 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0]] {('document',): 0, ('document', 'is'): 1, ('first',): 2, ('first', 'document'): 3, ('is',): 4,
A weirder example with
TraceableTfidfVectorizer
shows more differences.
- _word_ngrams(tokens, stop_words=None)#
Turn tokens into a sequence of n-grams after stop words filtering
- class mlinsights.mlmodel.sklearn_text.TraceableTfidfVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)#
Bases:
TfidfVectorizer
, NGramsMixin
Inherits from
NGramsMixin
which overloads method _word_ngrams to keep more information about n-grams but still produces the same outputs as TfidfVectorizer.
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer
from pprint import pformat

corpus = numpy.array([
    "This is the first document.",
    "This document is the second document.",
    "Is this the first document?",
    "",
]).reshape((4, ))

print('TfidfVectorizer from scikit-learn')
mod1 = TfidfVectorizer(ngram_range=(1, 2),
                       token_pattern="[a-zA-Z ]{1,4}")
mod1.fit(corpus)
print(mod1.transform(corpus).todense()[:2])
print(pformat(mod1.vocabulary_)[:100])

print('TraceableTfidfVectorizer from scikit-learn')
mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2),
                                token_pattern="[a-zA-Z ]{1,4}")
mod2.fit(corpus)
print(mod2.transform(corpus).todense()[:2])
print(pformat(mod2.vocabulary_)[:100])
>>>
TfidfVectorizer from scikit-learn [[0. 0. 0.329 0.329 0. 0. 0. 0. 0.26 0.26 0. 0. 0.26 0.26 0. 0. 0. 0. 0. 0.26 0. 0. 0.26 0.26 0. 0. 0.26 0.26 0.26 0. 0.329 0. 0. ] [0.245 0.245 0. 0. 0.245 0.245 0.245 0.245 0. 0. 0.245 0.245 0. 0. 0. 0. 0. 0. 0.245 0. 0.245 0.245 0. 0. 0.245 0.245 0. 0. 0.193 0.245 0. 0.245 0.245]] {' doc': 0, ' doc umen': 1, ' is ': 2, ' is the ': 3, ' sec': 4, ' sec ond ': 5, ' the': 6, TraceableTfidfVectorizer from scikit-learn [[0. 0. 0.329 0.329 0. 0. 0. 0. 0.26 0.26 0. 0. 0.26 0.26 0. 0. 0. 0. 0. 0.26 0. 0. 0.26 0.26 0. 0. 0.26 0.26 0.26 0. 0.329 0. 0. ] [0.245 0.245 0. 0. 0.245 0.245 0.245 0.245 0. 0. 0.245 0.245 0. 0. 0. 0. 0. 0. 0.245 0. 0.245 0.245 0. 0. 0.245 0.245 0. 0. 0.193 0.245 0. 0.245 0.245]] {(' doc',): 0, (' doc', 'umen'): 1, (' is ',): 2, (' is ', 'the '): 3, (' sec',): 4, (' sec', '
- _word_ngrams(tokens, stop_words=None)#
Turn tokens into a sequence of n-grams after stop words filtering