# Natural Language Toolkit: Interface to scikit-learn classifiers
#
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- """
- scikit-learn (http://scikit-learn.org) is a machine learning library for
- Python. It supports many classification algorithms, including SVMs,
- Naive Bayes, logistic regression (MaxEnt) and decision trees.
- This package implements a wrapper around scikit-learn classifiers. To use this
- wrapper, construct a scikit-learn estimator object, then use that to construct
- a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
- >>> from sklearn.svm import LinearSVC
- >>> from nltk.classify.scikitlearn import SklearnClassifier
- >>> classif = SklearnClassifier(LinearSVC())
- A scikit-learn classifier may include preprocessing steps when it's wrapped
- in a Pipeline object. The following constructs and wraps a Naive Bayes text
- classifier with tf-idf weighting and chi-square feature selection to get the
- best 1000 features:
- >>> from sklearn.feature_extraction.text import TfidfTransformer
- >>> from sklearn.feature_selection import SelectKBest, chi2
- >>> from sklearn.naive_bayes import MultinomialNB
- >>> from sklearn.pipeline import Pipeline
- >>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
- ... ('chi2', SelectKBest(chi2, k=1000)),
- ... ('nb', MultinomialNB())])
- >>> classif = SklearnClassifier(pipeline)
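
The wrapped classifier is trained by calling ``train`` with a list of
``(featureset, label)`` pairs. A minimal sketch; the feature names and
labels here are made up for illustration:

>>> train_data = [({'a': 4, 'b': 1}, 'x'),
...               ({'a': 5, 'b': 2}, 'x'),
...               ({'a': 0, 'b': 3}, 'y')]
>>> classif = SklearnClassifier(LinearSVC()).train(train_data)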
- """

from __future__ import print_function, unicode_literals

from six.moves import zip

from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist
from nltk import compat

try:
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.preprocessing import LabelEncoder
except ImportError:
    pass

__all__ = ['SklearnClassifier']


@compat.python_2_unicode_compatible
class SklearnClassifier(ClassifierI):
    """Wrapper for scikit-learn classifiers."""

    def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
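
        A dense-matrix sketch; ``GaussianNB`` is just one example of an
        estimator that cannot handle scipy sparse input:

        >>> from sklearn.naive_bayes import GaussianNB
        >>> classif = SklearnClassifier(GaussianNB(), sparse=False)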
- """
- self._clf = estimator
- self._encoder = LabelEncoder()
- self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)

    def __repr__(self):
        return "<SklearnClassifier(%r)>" % self._clf

    def classify_many(self, featuresets):
        """Classify a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :return: The predicted class label for each input sample.
        :rtype: list
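
        A minimal sketch with made-up features; any trained
        ``SklearnClassifier`` would do:

        >>> from sklearn.naive_bayes import BernoulliNB
        >>> train_data = [({'a': 1, 'b': 0}, 'x'), ({'a': 0, 'b': 1}, 'y')]
        >>> classif = SklearnClassifier(BernoulliNB()).train(train_data)
        >>> classif.classify_many([{'a': 1, 'b': 0}]) == ['x']
        True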
- """
- X = self._vectorizer.transform(featuresets)
- classes = self._encoder.classes_
- return [classes[i] for i in self._clf.predict(X)]

    def prob_classify_many(self, featuresets):
        """Compute per-class probabilities for a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :rtype: list of ``ProbDistI``
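
        The underlying estimator must implement ``predict_proba``. A minimal
        sketch with made-up features:

        >>> from sklearn.naive_bayes import BernoulliNB
        >>> train_data = [({'a': 1}, 'x'), ({'a': 0}, 'y')]
        >>> classif = SklearnClassifier(BernoulliNB()).train(train_data)
        >>> dist = classif.prob_classify_many([{'a': 1}])[0]
        >>> bool(dist.prob('x') > dist.prob('y'))
        True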
- """
- X = self._vectorizer.transform(featuresets)
- y_proba_list = self._clf.predict_proba(X)
- return [self._make_probdist(y_proba) for y_proba in y_proba_list]

    def labels(self):
        """The class labels used by this classifier.

        :rtype: list
        """
        return list(self._encoder.classes_)

    def train(self, labeled_featuresets):
        """
        Train (fit) the scikit-learn estimator.

        :param labeled_featuresets: A list of ``(featureset, label)``
            where each ``featureset`` is a dict mapping strings to either
            numbers, booleans or strings.
        :return: ``self``, so that construction and training can be chained.
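
        A minimal sketch, chaining construction and training; the feature
        names and labels are made up:

        >>> from sklearn.linear_model import LogisticRegression
        >>> train_data = [({'f': 1}, 'pos'), ({'f': 0}, 'neg')]
        >>> classif = SklearnClassifier(LogisticRegression()).train(train_data)
        >>> classif.labels() == ['neg', 'pos']
        True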
- """
- X, y = list(zip(*labeled_featuresets))
- X = self._vectorizer.fit_transform(X)
- y = self._encoder.fit_transform(y)
- self._clf.fit(X, y)
- return self

    def _make_probdist(self, y_proba):
        classes = self._encoder.classes_
        return DictionaryProbDist(dict((classes[i], p)
                                       for i, p in enumerate(y_proba)))


# skip doctests if scikit-learn is not installed
def setup_module(module):
    from nose import SkipTest

    try:
        import sklearn
    except ImportError:
        raise SkipTest("scikit-learn is not installed")


if __name__ == "__main__":
    from nltk.classify.util import names_demo, names_demo_features
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import BernoulliNB

    # Bernoulli Naive Bayes is designed for binary classification. We set the
    # binarize option to False since we know we're passing boolean features.
    print("scikit-learn Naive Bayes:")
    names_demo(
        SklearnClassifier(BernoulliNB(binarize=False)).train,
        features=names_demo_features,
    )

    # The C parameter on logistic regression (MaxEnt) controls regularization.
    # The higher it's set, the less regularized the classifier is.
    print("\n\nscikit-learn logistic regression:")
    names_demo(
        SklearnClassifier(LogisticRegression(C=1000)).train,
        features=names_demo_features,
    )