12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- # Natural Language Toolkit: Word Sense Disambiguation Algorithms
- #
- # Authors: Liling Tan <alvations@gmail.com>,
- # Dmitrijs Milajevs <dimazest@gmail.com>
- #
- # Copyright (C) 2001-2019 NLTK Project
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- from nltk.corpus import wordnet
def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
    """Pick the synset whose gloss shares the most words with the context.

    This is the original Lesk algorithm (1986) [1]: every candidate sense
    is scored by the number of distinct context words that also appear,
    as whitespace-separated tokens, in the sense's definition.

    :param iter context_sentence: The context sentence where the ambiguous
        word occurs, passed as an iterable of words.
    :param str ambiguous_word: The ambiguous word that requires WSD. It is
        only used to look up candidates when ``synsets`` is not given.
    :param str pos: A specified Part-of-Speech (POS). When truthy, only
        candidates whose ``pos()`` equals it are kept.
    :param iter synsets: Possible synsets of the ambiguous word. Defaults
        to ``wordnet.synsets(ambiguous_word)`` when ``None``.
    :return: ``lesk_sense`` The Synset() object with the highest signature
        overlap, or ``None`` if no candidate remains after filtering.
        Candidates with equal overlap are ranked by comparing the synsets
        themselves.

    Usage example::

        >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n')
        Synset('savings_bank.n.02')

    [1] Lesk, Michael. "Automatic sense disambiguation using machine
    readable dictionaries: how to tell a pine cone from an ice cream
    cone." Proceedings of the 5th Annual International Conference on
    Systems Documentation. ACM, 1986.
    http://dl.acm.org/citation.cfm?id=318728
    """
    context_words = set(context_sentence)

    # Fall back to a WordNet lookup only when the caller supplied nothing.
    candidates = wordnet.synsets(ambiguous_word) if synsets is None else synsets

    if pos:
        candidates = [cand for cand in candidates if str(cand.pos()) == pos]

    if not candidates:
        return None

    # Pair each candidate with its overlap count. Keeping the synset as the
    # second tuple element means ties on the count fall through to a
    # comparison of the synsets themselves.
    scored = []
    for cand in candidates:
        gloss_tokens = cand.definition().split()
        scored.append((len(context_words.intersection(gloss_tokens)), cand))

    best_pair = max(scored)
    return best_pair[1]
|