- # Natural Language Toolkit: Punkt sentence tokenizer
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Algorithm: Kiss & Strunk (2006)
- # Author: Willy <willy@csse.unimelb.edu.au> (original Python port)
- # Steven Bird <stevenbird1@gmail.com> (additions)
- # Edward Loper <edloper@gmail.com> (rewrite)
- # Joel Nothman <jnothman@student.usyd.edu.au> (almost rewrite)
- # Arthur Darcet <arthur@darcet.fr> (fixes)
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- r"""
- Punkt Sentence Tokenizer
- This tokenizer divides a text into a list of sentences
- by using an unsupervised algorithm to build a model for abbreviation
- words, collocations, and words that start sentences. It must be
- trained on a large collection of plaintext in the target language
- before it can be used.
- The NLTK data package includes a pre-trained Punkt tokenizer for
- English.
- >>> import nltk.data
- >>> text = '''
- ... Punkt knows that the periods in Mr. Smith and Johann S. Bach
- ... do not mark sentence boundaries. And sometimes sentences
- ... can start with non-capitalized words. i is a good variable
- ... name.
- ... '''
- >>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
- >>> print('\n-----\n'.join(sent_detector.tokenize(text.strip())))
- Punkt knows that the periods in Mr. Smith and Johann S. Bach
- do not mark sentence boundaries.
- -----
- And sometimes sentences
- can start with non-capitalized words.
- -----
- i is a good variable
- name.
- (Note that whitespace from the original text, including newlines, is
- retained in the output.)
- Punctuation following sentences is also included by default
- (from NLTK 3.0 onwards). It can be excluded with the realign_boundaries
- flag.
- >>> text = '''
- ... (How does it deal with this parenthesis?) "It should be part of the
- ... previous sentence." "(And the same with this one.)" ('And this one!')
- ... "('(And (this)) '?)" [(and this. )]
- ... '''
- >>> print('\n-----\n'.join(
- ... sent_detector.tokenize(text.strip())))
- (How does it deal with this parenthesis?)
- -----
- "It should be part of the
- previous sentence."
- -----
- "(And the same with this one.)"
- -----
- ('And this one!')
- -----
- "('(And (this)) '?)"
- -----
- [(and this. )]
- >>> print('\n-----\n'.join(
- ... sent_detector.tokenize(text.strip(), realign_boundaries=False)))
- (How does it deal with this parenthesis?
- -----
- ) "It should be part of the
- previous sentence.
- -----
- " "(And the same with this one.
- -----
- )" ('And this one!
- -----
- ')
- "('(And (this)) '?
- -----
- )" [(and this.
- -----
- )]
- However, Punkt is designed to learn parameters (a list of abbreviations, etc.)
- unsupervised from a corpus similar to the target domain. The pre-packaged models
- may therefore be unsuitable: use ``PunktSentenceTokenizer(text)`` to learn
- parameters from the given text.
- :class:`.PunktTrainer` learns parameters such as a list of abbreviations
- (without supervision) from portions of text. Using a ``PunktTrainer`` directly
- allows for incremental training and modification of the hyper-parameters used
- to decide what is considered an abbreviation, etc.
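- A minimal sketch of such incremental training (``corpus_a`` and ``corpus_b``
- are hypothetical plain-text strings, not part of this module)::
- trainer = PunktTrainer()
- trainer.train(corpus_a, finalize=False)
- trainer.train(corpus_b, finalize=False)
- tokenizer = PunktSentenceTokenizer(trainer.get_params())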
- The algorithm for this tokenizer is described in::
- Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
- Boundary Detection. Computational Linguistics 32: 485-525.
- """
- from __future__ import print_function, unicode_literals, division
- # TODO: Make orthographic heuristic less susceptible to overtraining
- # TODO: Frequent sentence starters optionally exclude always-capitalised words
- # FIXME: Problem with ending string with e.g. '!!!' -> '!! !'
- import re
- import math
- from collections import defaultdict
- from six import string_types
- from nltk.compat import unicode_repr, python_2_unicode_compatible
- from nltk.probability import FreqDist
- from nltk.tokenize.api import TokenizerI
- ######################################################################
- # { Orthographic Context Constants
- ######################################################################
- # The following constants are used to describe the orthographic
- # contexts in which a word can occur. BEG=beginning, MID=middle,
- # UNK=unknown, UC=uppercase, LC=lowercase, NC=no case.
- _ORTHO_BEG_UC = 1 << 1
- """Orthographic context: beginning of a sentence with upper case."""
- _ORTHO_MID_UC = 1 << 2
- """Orthographic context: middle of a sentence with upper case."""
- _ORTHO_UNK_UC = 1 << 3
- """Orthographic context: unknown position in a sentence with upper case."""
- _ORTHO_BEG_LC = 1 << 4
- """Orthographic context: beginning of a sentence with lower case."""
- _ORTHO_MID_LC = 1 << 5
- """Orthographic context: middle of a sentence with lower case."""
- _ORTHO_UNK_LC = 1 << 6
- """Orthographic context: unknown position in a sentence with lower case."""
- _ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC
- """Orthographic context: occurs with upper case."""
- _ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
- """Orthographic context: occurs with lower case."""
- _ORTHO_MAP = {
- ('initial', 'upper'): _ORTHO_BEG_UC,
- ('internal', 'upper'): _ORTHO_MID_UC,
- ('unknown', 'upper'): _ORTHO_UNK_UC,
- ('initial', 'lower'): _ORTHO_BEG_LC,
- ('internal', 'lower'): _ORTHO_MID_LC,
- ('unknown', 'lower'): _ORTHO_UNK_LC,
- }
- """A map from context position and first-letter case to the
- appropriate orthographic context flag."""
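- # For example (illustrative values, not from any corpus): a word type seen
- # capitalised at the start of one sentence and lower-cased mid-sentence
- # elsewhere accumulates _ORTHO_BEG_UC | _ORTHO_MID_LC == (1 << 1) | (1 << 5) == 34.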
- # } (end orthographic context constants)
- ######################################################################
- ######################################################################
- # { Decision reasons for debugging
- ######################################################################
- REASON_DEFAULT_DECISION = 'default decision'
- REASON_KNOWN_COLLOCATION = 'known collocation (both words)'
- REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = 'abbreviation + orthographic heuristic'
- REASON_ABBR_WITH_SENTENCE_STARTER = 'abbreviation + frequent sentence starter'
- REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic'
- REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = 'number + orthographic heuristic'
- REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = (
- 'initial + special orthographic heuristic'
- )
- # } (end decision reasons for debugging)
- ######################################################################
- ######################################################################
- # { Language-dependent variables
- ######################################################################
- class PunktLanguageVars(object):
- """
- Stores variables, mostly regular expressions, which may be
- language-dependent for correct application of the algorithm.
- An extension of this class may modify its properties to suit
- a language other than English; an instance can then be passed
- as an argument to PunktSentenceTokenizer and PunktTrainer
- constructors.
- """
- __slots__ = ('_re_period_context', '_re_word_tokenizer')
- def __getstate__(self):
- # All modifications to the class are performed by inheritance.
- # Non-default parameters to be pickled must be defined in the inherited
- # class.
- return 1
- def __setstate__(self, state):
- return 1
- sent_end_chars = ('.', '?', '!')
- """Characters which are candidates for sentence boundaries"""
- @property
- def _re_sent_end_chars(self):
- return '[%s]' % re.escape(''.join(self.sent_end_chars))
- internal_punctuation = ',:;' # might want to extend this..
- """sentence internal punctuation, which indicates an abbreviation if
- preceded by a period-final token."""
- re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)', re.MULTILINE)
- """Used to realign punctuation that should be included in a sentence
- although it follows the period (or ?, !)."""
- _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]"
- """Excludes some characters from starting word tokens"""
- _re_non_word_chars = r"(?:[?!)\";}\]\*:@\'\({\[])"
- """Characters that cannot appear within words"""
- _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
- """Hyphen and ellipsis are multi-character punctuation"""
- _word_tokenize_fmt = r'''(
- %(MultiChar)s
- |
- (?=%(WordStart)s)\S+? # Accept word characters until end is found
- (?= # Sequences marking a word's end
- \s| # White-space
- $| # End-of-string
- %(NonWord)s|%(MultiChar)s| # Punctuation
- ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word
- )
- |
- \S
- )'''
- """Format of a regular expression to split punctuation from words,
- excluding period."""
- def _word_tokenizer_re(self):
- """Compiles and returns a regular expression for word tokenization"""
- try:
- return self._re_word_tokenizer
- except AttributeError:
- self._re_word_tokenizer = re.compile(
- self._word_tokenize_fmt
- % {
- 'NonWord': self._re_non_word_chars,
- 'MultiChar': self._re_multi_char_punct,
- 'WordStart': self._re_word_start,
- },
- re.UNICODE | re.VERBOSE,
- )
- return self._re_word_tokenizer
- def word_tokenize(self, s):
- """Tokenize a string to split off punctuation other than periods"""
- return self._word_tokenizer_re().findall(s)
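- # For example, PunktLanguageVars().word_tokenize('Hello, world. (Bye!)') is
- # expected to return ['Hello', ',', 'world.', '(', 'Bye', '!', ')']: periods
- # stay attached to words while other punctuation is split off (illustrative).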
- _period_context_fmt = r"""
- \S* # some word material
- %(SentEndChars)s # a potential sentence ending
- (?=(?P<after_tok>
- %(NonWord)s # either other punctuation
- |
- \s+(?P<next_tok>\S+) # or whitespace and some other token
- ))"""
- """Format of a regular expression to find contexts including possible
- sentence boundaries. Matches the token in which the possible sentence
- boundary ends, and captures the following token within a lookahead
- expression."""
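- # For example, in the string 'etc. Next', the compiled pattern matches 'etc.',
- # with the lookahead group 'after_tok' capturing ' Next' and 'next_tok'
- # capturing 'Next' (illustrative; exact behaviour depends on sent_end_chars).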
- def period_context_re(self):
- """Compiles and returns a regular expression to find contexts
- including possible sentence boundaries."""
- try:
- return self._re_period_context
- except AttributeError:
- self._re_period_context = re.compile(
- self._period_context_fmt
- % {
- 'NonWord': self._re_non_word_chars,
- 'SentEndChars': self._re_sent_end_chars,
- },
- re.UNICODE | re.VERBOSE,
- )
- return self._re_period_context
- _re_non_punct = re.compile(r'[^\W\d]', re.UNICODE)
- """Matches token types that are not merely punctuation. (Types for
- numeric tokens are changed to ##number## and hence contain alpha.)"""
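- # A hypothetical sketch of the extension mechanism described in the
- # PunktLanguageVars docstring: a subclass can swap in language-specific
- # sentence-ending characters (here the Devanagari danda) and be passed to the
- # tokenizer constructor. Illustrative only, not part of the module:
- #
- # class HindiLanguageVars(PunktLanguageVars):
- # sent_end_chars = ('.', '?', '!', '\u0964')
- #
- # tokenizer = PunktSentenceTokenizer(lang_vars=HindiLanguageVars())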
- # }
- ######################################################################
- # ////////////////////////////////////////////////////////////
- # { Helper Functions
- # ////////////////////////////////////////////////////////////
- def _pair_iter(it):
- """
- Yields pairs of tokens from the given iterator such that each input
- token will appear as the first element in a yielded tuple. The last
- pair will have None as its second element.
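- For example, _pair_iter(iter('abc')) yields ('a', 'b'), ('b', 'c'),
- ('c', None).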
- """
- it = iter(it)
- try:
- prev = next(it)
- except StopIteration:
- return
- for el in it:
- yield (prev, el)
- prev = el
- yield (prev, None)
- ######################################################################
- # { Punkt Parameters
- ######################################################################
- class PunktParameters(object):
- """Stores data used to perform sentence boundary detection with Punkt."""
- def __init__(self):
- self.abbrev_types = set()
- """A set of word types for known abbreviations."""
- self.collocations = set()
- """A set of word type tuples for known common collocations
- where the first word ends in a period. E.g., ('S.', 'Bach')
- is a common collocation in a text that discusses 'Johann
- S. Bach'. These count as negative evidence for sentence
- boundaries."""
- self.sent_starters = set()
- """A set of word types for words that often appear at the
- beginning of sentences."""
- self.ortho_context = defaultdict(int)
- """A dictionary mapping word types to the set of orthographic
- contexts that word type appears in. Contexts are represented
- by adding orthographic context flags: ..."""
- def clear_abbrevs(self):
- self.abbrev_types = set()
- def clear_collocations(self):
- self.collocations = set()
- def clear_sent_starters(self):
- self.sent_starters = set()
- def clear_ortho_context(self):
- self.ortho_context = defaultdict(int)
- def add_ortho_context(self, typ, flag):
- self.ortho_context[typ] |= flag
- def _debug_ortho_context(self, typ):
- c = self.ortho_context[typ]
- if c & _ORTHO_BEG_UC:
- yield 'BEG-UC'
- if c & _ORTHO_MID_UC:
- yield 'MID-UC'
- if c & _ORTHO_UNK_UC:
- yield 'UNK-UC'
- if c & _ORTHO_BEG_LC:
- yield 'BEG-LC'
- if c & _ORTHO_MID_LC:
- yield 'MID-LC'
- if c & _ORTHO_UNK_LC:
- yield 'UNK-LC'
- ######################################################################
- # { PunktToken
- ######################################################################
- @python_2_unicode_compatible
- class PunktToken(object):
- """Stores a token of text with annotations produced during
- sentence boundary detection."""
- _properties = ['parastart', 'linestart', 'sentbreak', 'abbr', 'ellipsis']
- __slots__ = ['tok', 'type', 'period_final'] + _properties
- def __init__(self, tok, **params):
- self.tok = tok
- self.type = self._get_type(tok)
- self.period_final = tok.endswith('.')
- for p in self._properties:
- setattr(self, p, None)
- for k in params:
- setattr(self, k, params[k])
- # ////////////////////////////////////////////////////////////
- # { Regular expressions for properties
- # ////////////////////////////////////////////////////////////
- # Note: [A-Za-z] is approximated by [^\W\d] in the general case.
- _RE_ELLIPSIS = re.compile(r'\.\.+$')
- _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
- _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
- _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)
- # ////////////////////////////////////////////////////////////
- # { Derived properties
- # ////////////////////////////////////////////////////////////
- def _get_type(self, tok):
- """Returns a case-normalized representation of the token."""
- return self._RE_NUMERIC.sub('##number##', tok.lower())
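- # For example (illustrative): PunktToken('Mr.').type == 'mr.' while
- # PunktToken('1,345.05').type == '##number##', since numeric tokens are
- # collapsed into a single case-normalized type.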
- @property
- def type_no_period(self):
- """
- The type with its final period removed if it has one.
- """
- if len(self.type) > 1 and self.type[-1] == '.':
- return self.type[:-1]
- return self.type
- @property
- def type_no_sentperiod(self):
- """
- The type with its final period removed if it is marked as a
- sentence break.
- """
- if self.sentbreak:
- return self.type_no_period
- return self.type
- @property
- def first_upper(self):
- """True if the token's first character is uppercase."""
- return self.tok[0].isupper()
- @property
- def first_lower(self):
- """True if the token's first character is lowercase."""
- return self.tok[0].islower()
- @property
- def first_case(self):
- if self.first_lower:
- return 'lower'
- elif self.first_upper:
- return 'upper'
- return 'none'
- @property
- def is_ellipsis(self):
- """True if the token text is that of an ellipsis."""
- return self._RE_ELLIPSIS.match(self.tok)
- @property
- def is_number(self):
- """True if the token text is that of a number."""
- return self.type.startswith('##number##')
- @property
- def is_initial(self):
- """True if the token text is that of an initial."""
- return self._RE_INITIAL.match(self.tok)
- @property
- def is_alpha(self):
- """True if the token text is all alphabetic."""
- return self._RE_ALPHA.match(self.tok)
- @property
- def is_non_punct(self):
- """True if the token is either a number or is alphabetic."""
- return _re_non_punct.search(self.type)
- # ////////////////////////////////////////////////////////////
- # { String representation
- # ////////////////////////////////////////////////////////////
- def __repr__(self):
- """
- A string representation of the token that can reproduce it
- with eval(), which lists all the token's non-default
- annotations.
- """
- typestr = ' type=%s,' % unicode_repr(self.type) if self.type != self.tok else ''
- propvals = ', '.join(
- '%s=%s' % (p, unicode_repr(getattr(self, p)))
- for p in self._properties
- if getattr(self, p)
- )
- return '%s(%s,%s %s)' % (
- self.__class__.__name__,
- unicode_repr(self.tok),
- typestr,
- propvals,
- )
- def __str__(self):
- """
- A string representation akin to that used by Kiss and Strunk.
- """
- res = self.tok
- if self.abbr:
- res += '<A>'
- if self.ellipsis:
- res += '<E>'
- if self.sentbreak:
- res += '<S>'
- return res
- ######################################################################
- # { Punkt base class
- ######################################################################
- class PunktBaseClass(object):
- """
- Includes common components of PunktTrainer and PunktSentenceTokenizer.
- """
- def __init__(self, lang_vars=None, token_cls=PunktToken, params=None):
- if lang_vars is None:
- lang_vars = PunktLanguageVars()
- if params is None:
- params = PunktParameters()
- self._params = params
- """The collection of parameters that determines the behavior
- of the punkt tokenizer."""
- self._lang_vars = lang_vars
- self._Token = token_cls
- # ////////////////////////////////////////////////////////////
- # { Word tokenization
- # ////////////////////////////////////////////////////////////
- def _tokenize_words(self, plaintext):
- """
- Divide the given text into tokens, using the punkt word
- segmentation regular expression, and generate the resulting tokens
- as augmented tokens carrying two boolean flags for whether
- the given token occurs at the start of a paragraph or of a new line,
- respectively.
- """
- parastart = False
- for line in plaintext.split('\n'):
- if line.strip():
- line_toks = iter(self._lang_vars.word_tokenize(line))
- try:
- tok = next(line_toks)
- except StopIteration:
- continue
- yield self._Token(tok, parastart=parastart, linestart=True)
- parastart = False
- for t in line_toks:
- yield self._Token(t)
- else:
- parastart = True
- # ////////////////////////////////////////////////////////////
- # { Annotation Procedures
- # ////////////////////////////////////////////////////////////
- def _annotate_first_pass(self, tokens):
- """
- Perform the first pass of annotation, which makes decisions
- based purely on the word type of each word:
- - '?', '!', and '.' are marked as sentence breaks.
- - sequences of two or more periods are marked as ellipsis.
- - any word ending in '.' that's a known abbreviation is
- marked as an abbreviation.
- - any other word ending in '.' is marked as a sentence break.
- Yields each token in turn, annotated in place with these decisions.
- """
- for aug_tok in tokens:
- self._first_pass_annotation(aug_tok)
- yield aug_tok
- def _first_pass_annotation(self, aug_tok):
- """
- Performs type-based annotation on a single token.
- """
- tok = aug_tok.tok
- if tok in self._lang_vars.sent_end_chars:
- aug_tok.sentbreak = True
- elif aug_tok.is_ellipsis:
- aug_tok.ellipsis = True
- elif aug_tok.period_final and not tok.endswith('..'):
- if (
- tok[:-1].lower() in self._params.abbrev_types
- or tok[:-1].lower().split('-')[-1] in self._params.abbrev_types
- ):
- aug_tok.abbr = True
- else:
- aug_tok.sentbreak = True
- return
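- # For instance (illustrative values): with abbrev_types == {'etc'}, the token
- # 'etc.' is annotated abbr=True, an unknown 'word.' becomes sentbreak=True,
- # and '...' is annotated ellipsis=True.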
- ######################################################################
- # { Punkt Trainer
- ######################################################################
- class PunktTrainer(PunktBaseClass):
- """Learns parameters used in Punkt sentence boundary detection."""
- def __init__(
- self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken
- ):
- PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls)
- self._type_fdist = FreqDist()
- """A frequency distribution giving the frequency of each
- case-normalized token type in the training data."""
- self._num_period_toks = 0
- """The number of words ending in period in the training data."""
- self._collocation_fdist = FreqDist()
- """A frequency distribution giving the frequency of all
- bigrams in the training data where the first word ends in a
- period. Bigrams are encoded as tuples of word types.
- Especially common collocations are extracted from this
- frequency distribution, and stored in
- ``_params.collocations``."""
- self._sent_starter_fdist = FreqDist()
- """A frequency distribution giving the frequency of all words
- that occur in the training data at the beginning of a sentence
- (after the first pass of annotation). Especially common
- sentence starters are extracted from this frequency
- distribution, and stored in ``_params.sent_starters``.
- """
- self._sentbreak_count = 0
- """The total number of sentence breaks identified in training, used for
- calculating the frequent sentence starter heuristic."""
- self._finalized = True
- """A flag as to whether the training has been finalized by finding
- collocations and sentence starters, or whether finalize_training()
- still needs to be called."""
- if train_text:
- self.train(train_text, verbose, finalize=True)
- def get_params(self):
- """
- Calculates and returns parameters for sentence boundary detection as
- derived from training."""
- if not self._finalized:
- self.finalize_training()
- return self._params
- # ////////////////////////////////////////////////////////////
- # { Customization Variables
- # ////////////////////////////////////////////////////////////
- ABBREV = 0.3
- """cut-off value whether a 'token' is an abbreviation"""
- IGNORE_ABBREV_PENALTY = False
- """allows the disabling of the abbreviation penalty heuristic, which
- exponentially disadvantages words that are found at times without a
- final period."""
- ABBREV_BACKOFF = 5
- """upper cut-off for Mikheev's(2002) abbreviation detection algorithm"""
- COLLOCATION = 7.88
- """minimal log-likelihood value that two tokens need to be considered
- as a collocation"""
- SENT_STARTER = 30
- """minimal log-likelihood value that a token requires to be considered
- as a frequent sentence starter"""
- INCLUDE_ALL_COLLOCS = False
- """this includes as potential collocations all word pairs where the first
- word ends in a period. It may be useful in corpora where there is a lot
- of variation that makes abbreviations like Mr difficult to identify."""
- INCLUDE_ABBREV_COLLOCS = False
- """this includes as potential collocations all word pairs where the first
- word is an abbreviation. Such collocations override the orthographic
- heuristic, but not the sentence starter heuristic. This is overridden by
- INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
- and ordinals are considered."""
- """"""
- MIN_COLLOC_FREQ = 1
- """this sets a minimum bound on the number of times a bigram needs to
- appear before it can be considered a collocation, in addition to log
- likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""
- # ////////////////////////////////////////////////////////////
- # { Training..
- # ////////////////////////////////////////////////////////////
- def train(self, text, verbose=False, finalize=True):
- """
- Collects training data from a given text. If finalize is True, it
- will determine all the parameters for sentence boundary detection. If
- not, this will be delayed until get_params() or finalize_training() is
- called. If verbose is True, abbreviations found will be listed.
- """
- # Break the text into tokens; record which token indices correspond to
- # line starts and paragraph starts; and determine their types.
- self._train_tokens(self._tokenize_words(text), verbose)
- if finalize:
- self.finalize_training(verbose)
- def train_tokens(self, tokens, verbose=False, finalize=True):
- """
- Collects training data from a given list of tokens.
- """
- self._train_tokens((self._Token(t) for t in tokens), verbose)
- if finalize:
- self.finalize_training(verbose)
- def _train_tokens(self, tokens, verbose):
- self._finalized = False
- # Ensure tokens are a list
- tokens = list(tokens)
- # Find the frequency of each case-normalized type. (Don't
- # strip off final periods.) Also keep track of the number of
- # tokens that end in periods.
- for aug_tok in tokens:
- self._type_fdist[aug_tok.type] += 1
- if aug_tok.period_final:
- self._num_period_toks += 1
- # Look for new abbreviations, and for types that are no longer abbreviations
- unique_types = self._unique_types(tokens)
- for abbr, score, is_add in self._reclassify_abbrev_types(unique_types):
- if score >= self.ABBREV:
- if is_add:
- self._params.abbrev_types.add(abbr)
- if verbose:
- print((' Abbreviation: [%6.4f] %s' % (score, abbr)))
- else:
- if not is_add:
- self._params.abbrev_types.remove(abbr)
- if verbose:
- print((' Removed abbreviation: [%6.4f] %s' % (score, abbr)))
- # Make a preliminary pass through the document, marking likely
- # sentence breaks, abbreviations, and ellipsis tokens.
- tokens = list(self._annotate_first_pass(tokens))
- # Check what contexts each word type can appear in, given the
- # case of its first letter.
- self._get_orthography_data(tokens)
- # We need total number of sentence breaks to find sentence starters
- self._sentbreak_count += self._get_sentbreak_count(tokens)
- # The remaining heuristics relate to pairs of tokens where the first
- # ends in a period.
- for aug_tok1, aug_tok2 in _pair_iter(tokens):
- if not aug_tok1.period_final or not aug_tok2:
- continue
- # Is the first token a rare abbreviation?
- if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
- self._params.abbrev_types.add(aug_tok1.type_no_period)
- if verbose:
- print((' Rare Abbrev: %s' % aug_tok1.type))
- # Does second token have a high likelihood of starting a sentence?
- if self._is_potential_sent_starter(aug_tok2, aug_tok1):
- self._sent_starter_fdist[aug_tok2.type] += 1
- # Is this bigram a potential collocation?
- if self._is_potential_collocation(aug_tok1, aug_tok2):
- self._collocation_fdist[
- (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod)
- ] += 1
- def _unique_types(self, tokens):
- return set(aug_tok.type for aug_tok in tokens)
- def finalize_training(self, verbose=False):
- """
- Uses data that has been gathered in training to determine likely
- collocations and sentence starters.
- """
- self._params.clear_sent_starters()
- for typ, ll in self._find_sent_starters():
- self._params.sent_starters.add(typ)
- if verbose:
- print((' Sent Starter: [%6.4f] %r' % (ll, typ)))
- self._params.clear_collocations()
- for (typ1, typ2), ll in self._find_collocations():
- self._params.collocations.add((typ1, typ2))
- if verbose:
- print((' Collocation: [%6.4f] %r+%r' % (ll, typ1, typ2)))
- self._finalized = True
- # ////////////////////////////////////////////////////////////
- # { Overhead reduction
- # ////////////////////////////////////////////////////////////
- def freq_threshold(
- self, ortho_thresh=2, type_thresh=2, colloc_thres=2, sentstart_thresh=2
- ):
- """
- Allows memory use to be reduced after much training by removing data
- about rare tokens that are unlikely to have a statistical effect with
- further training. Entries occurring above the given thresholds will be
- retained.
- """
- if ortho_thresh > 1:
- old_oc = self._params.ortho_context
- self._params.clear_ortho_context()
- for tok in self._type_fdist:
- count = self._type_fdist[tok]
- if count >= ortho_thresh:
- self._params.ortho_context[tok] = old_oc[tok]
- self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh)
- self._collocation_fdist = self._freq_threshold(
- self._collocation_fdist, colloc_thres
- )
- self._sent_starter_fdist = self._freq_threshold(
- self._sent_starter_fdist, sentstart_thresh
- )
- def _freq_threshold(self, fdist, threshold):
- """
- Returns a FreqDist containing only the data with counts at or above the
- given threshold, plus a mapping (None -> number of removed entries).
- """
- # We assume that there is more data below the threshold than above it
- # and so create a new FreqDist rather than working in place.
- res = FreqDist()
- num_removed = 0
- for tok in fdist:
- count = fdist[tok]
- if count < threshold:
- num_removed += 1
- else:
- res[tok] += count
- res[None] += num_removed
- return res
- # ////////////////////////////////////////////////////////////
- # { Orthographic data
- # ////////////////////////////////////////////////////////////
- def _get_orthography_data(self, tokens):
- """
- Collect information about whether each token type occurs
- with different case patterns (i) overall, (ii) at
- sentence-initial positions, and (iii) at sentence-internal
- positions.
- """
- # 'initial' or 'internal' or 'unknown'
- context = 'internal'
- tokens = list(tokens)
- for aug_tok in tokens:
- # If we encounter a paragraph break, then it's a good sign
- # that it's a sentence break. But err on the side of
- # caution (by not positing a sentence break) if we just
- # saw an abbreviation.
- if aug_tok.parastart and context != 'unknown':
- context = 'initial'
- # If we're at the beginning of a line, then we can't decide
- # between 'internal' and 'initial'.
- if aug_tok.linestart and context == 'internal':
- context = 'unknown'
- # Find the case-normalized type of the token. If it's a
- # sentence-final token, strip off the period.
- typ = aug_tok.type_no_sentperiod
- # Update the orthographic context table.
- flag = _ORTHO_MAP.get((context, aug_tok.first_case), 0)
- if flag:
- self._params.add_ortho_context(typ, flag)
- # Decide whether the next word is at a sentence boundary.
- if aug_tok.sentbreak:
- if not (aug_tok.is_number or aug_tok.is_initial):
- context = 'initial'
- else:
- context = 'unknown'
- elif aug_tok.ellipsis or aug_tok.abbr:
- context = 'unknown'
- else:
- context = 'internal'
- # ////////////////////////////////////////////////////////////
- # { Abbreviations
- # ////////////////////////////////////////////////////////////
- def _reclassify_abbrev_types(self, types):
- """
- (Re)classifies each given token if
- - it is period-final and not a known abbreviation; or
- - it is not period-final and is otherwise a known abbreviation
- by checking whether its previous classification still holds according
- to the heuristics of section 3.
- Yields triples (abbr, score, is_add) where abbr is the type in question,
- score is its log-likelihood with penalties applied, and is_add specifies
- whether the present type is a candidate for inclusion or exclusion as an
- abbreviation, such that:
- - (is_add and score >= 0.3) suggests a new abbreviation; and
- - (not is_add and score < 0.3) suggests excluding an abbreviation.
- """
- # (While one could recalculate abbreviations from all .-final tokens at
- # every iteration, in cases requiring efficiency, the number of tokens
- # in the present training document will be much less.)
- for typ in types:
- # Check some basic conditions, to rule out words that are
- # clearly not abbrev_types.
- if not _re_non_punct.search(typ) or typ == '##number##':
- continue
- if typ.endswith('.'):
- if typ in self._params.abbrev_types:
- continue
- typ = typ[:-1]
- is_add = True
- else:
- if typ not in self._params.abbrev_types:
- continue
- is_add = False
- # Count how many periods & nonperiods are in the
- # candidate.
- num_periods = typ.count('.') + 1
- num_nonperiods = len(typ) - num_periods + 1
- # Let <a> be the candidate without the period, and <b>
- # be the period. Find a log likelihood ratio that
- # indicates whether <ab> occurs as a single unit (high
- # value of ll), or as two independent units <a> and
- # <b> (low value of ll).
- count_with_period = self._type_fdist[typ + '.']
- count_without_period = self._type_fdist[typ]
- ll = self._dunning_log_likelihood(
- count_with_period + count_without_period,
- self._num_period_toks,
- count_with_period,
- self._type_fdist.N(),
- )
- # Apply three scaling factors to 'tweak' the basic log
- # likelihood ratio:
- # F_length: long word -> less likely to be an abbrev
- # F_periods: more periods -> more likely to be an abbrev
- # F_penalty: penalize occurrences w/o a period
- f_length = math.exp(-num_nonperiods)
- f_periods = num_periods
- f_penalty = int(self.IGNORE_ABBREV_PENALTY) or math.pow(
- num_nonperiods, -count_without_period
- )
- score = ll * f_length * f_periods * f_penalty
- yield typ, score, is_add
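- # Worked example with made-up numbers: for a candidate type 'mr' (final
- # period stripped), num_periods = 1 and num_nonperiods = 2, so f_length =
- # exp(-2) ~= 0.135 and f_periods = 1; if the type never occurs without its
- # period, f_penalty = 2 ** -0 = 1, so the score is roughly 0.135 * ll and
- # ll must exceed about 2.2 to clear the ABBREV cut-off of 0.3.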
- def find_abbrev_types(self):
- """
- Recalculates abbreviations given type frequencies, despite no prior
- determination of abbreviations.
- This fails to include abbreviations otherwise found as "rare".
- """
- self._params.clear_abbrevs()
- tokens = (typ for typ in self._type_fdist if typ and typ.endswith('.'))
- for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
- if score >= self.ABBREV:
- self._params.abbrev_types.add(abbr)
- # This function combines the work done by the original code's
- # functions `count_orthography_context`, `get_orthography_count`,
- # and `get_rare_abbreviations`.
- def _is_rare_abbrev_type(self, cur_tok, next_tok):
- """
- A word type is counted as a rare abbreviation if...
- - it's not already marked as an abbreviation
- - it occurs fewer than ABBREV_BACKOFF times
- - either it is followed by a sentence-internal punctuation
- mark, *or* it is followed by a lower-case word that
- sometimes appears with upper case, but never occurs with
- lower case at the beginning of sentences.
- """
- if cur_tok.abbr or not cur_tok.sentbreak:
- return False
- # Find the case-normalized type of the token. If it's
- # a sentence-final token, strip off the period.
- typ = cur_tok.type_no_sentperiod
- # Proceed only if the type hasn't been categorized as an
- # abbreviation already, and is sufficiently rare...
- count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
- if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF:
- return False
- # Record this token as an abbreviation if the next
- # token is a sentence-internal punctuation mark.
- # [XX] :1 or check the whole thing??
- if next_tok.tok[:1] in self._lang_vars.internal_punctuation:
- return True
- # Record this type as an abbreviation if the next
- # token... (i) starts with a lower case letter,
- # (ii) sometimes occurs with an uppercase letter,
- # and (iii) never occurs with an uppercase letter
- # sentence-internally.
- # [xx] should the check for (ii) be modified??
- elif next_tok.first_lower:
- typ2 = next_tok.type_no_sentperiod
- typ2ortho_context = self._params.ortho_context[typ2]
- if (typ2ortho_context & _ORTHO_BEG_UC) and not (
- typ2ortho_context & _ORTHO_MID_UC
- ):
- return True
- # ////////////////////////////////////////////////////////////
- # { Log Likelihoods
- # ////////////////////////////////////////////////////////////
- # helper for _reclassify_abbrev_types:
- @staticmethod
- def _dunning_log_likelihood(count_a, count_b, count_ab, N):
- """
- A function that calculates the modified Dunning log-likelihood
- ratio scores for abbreviation candidates. The details of how
- this works are available in the paper.
- """
- p1 = count_b / N
- p2 = 0.99
- null_hypo = count_ab * math.log(p1) + (count_a - count_ab) * math.log(1.0 - p1)
- alt_hypo = count_ab * math.log(p2) + (count_a - count_ab) * math.log(1.0 - p2)
- likelihood = null_hypo - alt_hypo
- return -2.0 * likelihood
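- # Rough worked example (made-up counts): a candidate seen count_a=20 times,
- # count_ab=18 of them period-final, with count_b=500 period-final tokens and
- # N=10000 tokens overall, gives p1=0.05 and a returned score of roughly 89.3,
- # strongly favouring the single-unit (abbreviation) hypothesis.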
- @staticmethod
- def _col_log_likelihood(count_a, count_b, count_ab, N):
- """
- Computes the log-likelihood ratio described in Algorithms 6 and 7 of the
- original paper. Unlike _dunning_log_likelihood above, this uses the
- standard (unmodified) Dunning log-likelihood formulation.
- """
- p = count_b / N
- p1 = count_ab / count_a
- try:
- p2 = (count_b - count_ab) / (N - count_a)
- except ZeroDivisionError as e:
- p2 = 1
- try:
- summand1 = count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p)
- except ValueError as e:
- summand1 = 0
- try:
- summand2 = (count_b - count_ab) * math.log(p) + (
- N - count_a - count_b + count_ab
- ) * math.log(1.0 - p)
- except ValueError as e:
- summand2 = 0
- if count_a == count_ab or p1 <= 0 or p1 >= 1:
- summand3 = 0
- else:
- summand3 = count_ab * math.log(p1) + (count_a - count_ab) * math.log(
- 1.0 - p1
- )
- if count_b == count_ab or p2 <= 0 or p2 >= 1:
- summand4 = 0
- else:
- summand4 = (count_b - count_ab) * math.log(p2) + (
- N - count_a - count_b + count_ab
- ) * math.log(1.0 - p2)
- likelihood = summand1 + summand2 - summand3 - summand4
- return -2.0 * likelihood
- # ////////////////////////////////////////////////////////////
- # { Collocation Finder
- # ////////////////////////////////////////////////////////////
- def _is_potential_collocation(self, aug_tok1, aug_tok2):
- """
- Returns True if the pair of tokens may form a collocation given
- log-likelihood statistics.
- """
- return (
- (
- self.INCLUDE_ALL_COLLOCS
- or (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr)
- or (aug_tok1.sentbreak and (aug_tok1.is_number or aug_tok1.is_initial))
- )
- and aug_tok1.is_non_punct
- and aug_tok2.is_non_punct
- )
- def _find_collocations(self):
- """
- Generates likely collocations and their log-likelihood.
- """
- for types in self._collocation_fdist:
- try:
- typ1, typ2 = types
- except TypeError:
- # types may be None after calling freq_threshold()
- continue
- if typ2 in self._params.sent_starters:
- continue
- col_count = self._collocation_fdist[types]
- typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + '.']
- typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + '.']
- if (
- typ1_count > 1
- and typ2_count > 1
- and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count)
- ):
- ll = self._col_log_likelihood(
- typ1_count, typ2_count, col_count, self._type_fdist.N()
- )
- # Filter out the not-so-collocative
- if ll >= self.COLLOCATION and (
- self._type_fdist.N() / typ1_count > typ2_count / col_count
- ):
- yield (typ1, typ2), ll
- # ////////////////////////////////////////////////////////////
- # { Sentence-Starter Finder
- # ////////////////////////////////////////////////////////////
- def _is_potential_sent_starter(self, cur_tok, prev_tok):
- """
- Returns True given a token and the token that precedes it if it
- seems clear that the token is beginning a sentence.
- """
- # If a token (i) is preceded by a sentence break that is
- # not a potential ordinal number or initial, and (ii) is
- # alphabetic, then it is a sentence starter.
- return (
- prev_tok.sentbreak
- and not (prev_tok.is_number or prev_tok.is_initial)
- and cur_tok.is_alpha
- )
- def _find_sent_starters(self):
- """
- Uses collocation heuristics for each candidate token to
- determine if it frequently starts sentences.
- """
- for typ in self._sent_starter_fdist:
- if not typ:
- continue
- typ_at_break_count = self._sent_starter_fdist[typ]
- typ_count = self._type_fdist[typ] + self._type_fdist[typ + '.']
- if typ_count < typ_at_break_count:
- # needed after freq_threshold
- continue
- ll = self._col_log_likelihood(
- self._sentbreak_count,
- typ_count,
- typ_at_break_count,
- self._type_fdist.N(),
- )
- if (
- ll >= self.SENT_STARTER
- and self._type_fdist.N() / self._sentbreak_count
- > typ_count / typ_at_break_count
- ):
- yield typ, ll
- def _get_sentbreak_count(self, tokens):
- """
- Returns the number of sentence breaks marked in a given set of
- augmented tokens.
- """
- return sum(1 for aug_tok in tokens if aug_tok.sentbreak)
- ######################################################################
- # { Punkt Sentence Tokenizer
- ######################################################################
- class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
- """
- A sentence tokenizer which uses an unsupervised algorithm to build
- a model for abbreviation words, collocations, and words that start
- sentences; and then uses that model to find sentence boundaries.
- This approach has been shown to work well for many European
- languages.
- """
- def __init__(
- self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken
- ):
- """
- train_text can either be the sole training text for this sentence
- boundary detector, or can be a PunktParameters object.
- """
- PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls)
- if train_text:
- self._params = self.train(train_text, verbose)
- def train(self, train_text, verbose=False):
- """
- Derives parameters from a given training text, or uses the parameters
- given. Repeated calls to this method destroy previous parameters. For
- incremental training, instantiate a separate PunktTrainer instance.
- """
- if not isinstance(train_text, string_types):
- return train_text
- return PunktTrainer(
- train_text, lang_vars=self._lang_vars, token_cls=self._Token
- ).get_params()
- # ////////////////////////////////////////////////////////////
- # { Tokenization
- # ////////////////////////////////////////////////////////////
- def tokenize(self, text, realign_boundaries=True):
- """
- Given a text, returns a list of the sentences in that text.
- """
- return list(self.sentences_from_text(text, realign_boundaries))
- def debug_decisions(self, text):
- """
- Classifies candidate periods as sentence breaks, yielding a dict for
- each that may be used to understand why the decision was made.
- See format_debug_decision() to help make this output readable.
- """
- for match in self._lang_vars.period_context_re().finditer(text):
- decision_text = match.group() + match.group('after_tok')
- tokens = self._tokenize_words(decision_text)
- tokens = list(self._annotate_first_pass(tokens))
- while not tokens[0].period_final:
- tokens.pop(0)
- yield dict(
- period_index=match.end() - 1,
- text=decision_text,
- type1=tokens[0].type,
- type2=tokens[1].type,
- type1_in_abbrs=bool(tokens[0].abbr),
- type1_is_initial=bool(tokens[0].is_initial),
- type2_is_sent_starter=tokens[1].type_no_sentperiod
- in self._params.sent_starters,
- type2_ortho_heuristic=self._ortho_heuristic(tokens[1]),
- type2_ortho_contexts=set(
- self._params._debug_ortho_context(tokens[1].type_no_sentperiod)
- ),
- collocation=(tokens[0].type_no_sentperiod, tokens[1].type_no_sentperiod)
- in self._params.collocations,
- reason=self._second_pass_annotation(tokens[0], tokens[1])
- or REASON_DEFAULT_DECISION,
- break_decision=tokens[0].sentbreak,
- )
- def span_tokenize(self, text, realign_boundaries=True):
- """
- Given a text, generates (start, end) spans of sentences
- in the text.
- """
- slices = self._slices_from_text(text)
- if realign_boundaries:
- slices = self._realign_boundaries(text, slices)
- for sl in slices:
- yield (sl.start, sl.stop)
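- # For example, even an untrained tokenizer splits unambiguous periods:
- # list(PunktSentenceTokenizer().span_tokenize('One. Two.')) should give
- # [(0, 4), (5, 9)] (illustrative; trained models refine these decisions).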
- def sentences_from_text(self, text, realign_boundaries=True):
- """
- Given a text, generates the sentences in that text by only
- testing candidate sentence breaks. If realign_boundaries is
- True, includes in the sentence closing punctuation that
- follows the period.
- """
- return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
- def _slices_from_text(self, text):
- last_break = 0
- for match in self._lang_vars.period_context_re().finditer(text):
- context = match.group() + match.group('after_tok')
- if self.text_contains_sentbreak(context):
- yield slice(last_break, match.end())
- if match.group('next_tok'):
- # next sentence starts after whitespace
- last_break = match.start('next_tok')
- else:
- # next sentence starts at following punctuation
- last_break = match.end()
- # The last sentence should not contain trailing whitespace.
- yield slice(last_break, len(text.rstrip()))
- def _realign_boundaries(self, text, slices):
- """
- Attempts to realign punctuation that falls after the period but
- should otherwise be included in the same sentence.
- For example: "(Sent1.) Sent2." will otherwise be split as::
- ["(Sent1.", ") Sent1."].
- This method will produce::
- ["(Sent1.)", "Sent2."].
- """
- realign = 0
- for sl1, sl2 in _pair_iter(slices):
- sl1 = slice(sl1.start + realign, sl1.stop)
- if not sl2:
- if text[sl1]:
- yield sl1
- continue
- m = self._lang_vars.re_boundary_realignment.match(text[sl2])
- if m:
- yield slice(sl1.start, sl2.start + len(m.group(0).rstrip()))
- realign = m.end()
- else:
- realign = 0
- if text[sl1]:
- yield sl1
- def text_contains_sentbreak(self, text):
- """
- Returns True if the given text includes a sentence break.
- """
- found = False # used to ignore last token
- for t in self._annotate_tokens(self._tokenize_words(text)):
- if found:
- return True
- if t.sentbreak:
- found = True
- return False
- def sentences_from_text_legacy(self, text):
- """
- Given a text, generates the sentences in that text. Annotates all
- tokens, rather than just those with possible sentence breaks. Should
- produce the same results as ``sentences_from_text``.
- """
- tokens = self._annotate_tokens(self._tokenize_words(text))
- return self._build_sentence_list(text, tokens)
- def sentences_from_tokens(self, tokens):
- """
- Given a sequence of tokens, generates lists of tokens, each list
- corresponding to a sentence.
- """
- tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens))
- sentence = []
- for aug_tok in tokens:
- sentence.append(aug_tok.tok)
- if aug_tok.sentbreak:
- yield sentence
- sentence = []
- if sentence:
- yield sentence
- def _annotate_tokens(self, tokens):
- """
- Given a set of tokens augmented with markers for line-start and
- paragraph-start, returns an iterator through those tokens with full
- annotation including predicted sentence breaks.
- """
- # Make a preliminary pass through the document, marking likely
- # sentence breaks, abbreviations, and ellipsis tokens.
- tokens = self._annotate_first_pass(tokens)
- # Make a second pass through the document, using token context
- # information to change our preliminary decisions about where
- # sentence breaks, abbreviations, and ellipsis occurs.
- tokens = self._annotate_second_pass(tokens)
- ## [XX] TESTING
- # tokens = list(tokens)
- # self.dump(tokens)
- return tokens
- def _build_sentence_list(self, text, tokens):
- """
- Given the original text and the list of augmented word tokens,
- construct and return a tokenized list of sentence strings.
- """
- # Most of the work here is making sure that we put the right
- # pieces of whitespace back in all the right places.
- # Our position in the source text, used to keep track of which
- # whitespace to add:
- pos = 0
- # A regular expression that finds pieces of whitespace:
- WS_REGEXP = re.compile(r'\s*')
- sentence = ''
- for aug_tok in tokens:
- tok = aug_tok.tok
- # Find the whitespace before this token, and update pos.
- ws = WS_REGEXP.match(text, pos).group()
- pos += len(ws)
- # Some of the rules used by the punkt word tokenizer
- # strip whitespace out of the text, resulting in tokens
- # that contain whitespace in the source text. If our
- # token doesn't match, see if adding whitespace helps.
- # If so, then use the version with whitespace.
- if text[pos : pos + len(tok)] != tok:
- pat = r'\s*'.join(re.escape(c) for c in tok)
- m = re.compile(pat).match(text, pos)
- if m:
- tok = m.group()
- # Move our position pointer to the end of the token.
- assert text[pos : pos + len(tok)] == tok
- pos += len(tok)
- # Add this token. If it's not at the beginning of the
- # sentence, then include any whitespace that separated it
- # from the previous token.
- if sentence:
- sentence += ws
- sentence += tok
- # If we're at a sentence break, then start a new sentence.
- if aug_tok.sentbreak:
- yield sentence
- sentence = ''
- # If the last sentence is empty, discard it.
- if sentence:
- yield sentence
- # [XX] TESTING
- def dump(self, tokens):
- print('writing to /tmp/punkt.new...')
- with open('/tmp/punkt.new', 'w') as outfile:
- for aug_tok in tokens:
- if aug_tok.parastart:
- outfile.write('\n\n')
- elif aug_tok.linestart:
- outfile.write('\n')
- else:
- outfile.write(' ')
- outfile.write(str(aug_tok))
- # ////////////////////////////////////////////////////////////
- # { Customization Variables
- # ////////////////////////////////////////////////////////////
- PUNCTUATION = tuple(';:,.!?')
- # ////////////////////////////////////////////////////////////
- # { Annotation Procedures
- # ////////////////////////////////////////////////////////////
- def _annotate_second_pass(self, tokens):
- """
- Performs a token-based classification (section 4) over the given
- tokens, making use of the orthographic heuristic (4.1.1), collocation
- heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
- """
- for t1, t2 in _pair_iter(tokens):
- self._second_pass_annotation(t1, t2)
- yield t1
- def _second_pass_annotation(self, aug_tok1, aug_tok2):
- """
- Performs token-based classification over a pair of contiguous tokens
- updating the first.
- """
- # Is it the last token? We can't do anything then.
- if not aug_tok2:
- return
- tok = aug_tok1.tok
- if not aug_tok1.period_final:
- # We only care about words ending in periods.
- return
- typ = aug_tok1.type_no_period
- next_tok = aug_tok2.tok
- next_typ = aug_tok2.type_no_sentperiod
- tok_is_initial = aug_tok1.is_initial
- # [4.1.2. Collocation Heuristic] If there's a
- # collocation between the word before and after the
- # period, then label tok as an abbreviation and NOT
- # a sentence break. Note that collocations with
- # frequent sentence starters as their second word are
- # excluded in training.
- if (typ, next_typ) in self._params.collocations:
- aug_tok1.sentbreak = False
- aug_tok1.abbr = True
- return REASON_KNOWN_COLLOCATION
- # [4.2. Token-Based Reclassification of Abbreviations] If
- # the token is an abbreviation or an ellipsis, then decide
- # whether we should *also* classify it as a sentbreak.
- if (aug_tok1.abbr or aug_tok1.ellipsis) and (not tok_is_initial):
- # [4.1.1. Orthographic Heuristic] Check if there's
- # orthographic evidence about whether the next word
- # starts a sentence or not.
- is_sent_starter = self._ortho_heuristic(aug_tok2)
- if is_sent_starter == True:
- aug_tok1.sentbreak = True
- return REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC
- # [4.1.3. Frequent Sentence Starter Heuristic] If the
- # next word is capitalized, and is a member of the
- # frequent-sentence-starters list, then label tok as a
- # sentence break.
- if aug_tok2.first_upper and next_typ in self._params.sent_starters:
- aug_tok1.sentbreak = True
- return REASON_ABBR_WITH_SENTENCE_STARTER
- # [4.3. Token-Based Detection of Initials and Ordinals]
- # Check if any initials or ordinals tokens that are marked
- # as sentbreaks should be reclassified as abbreviations.
- if tok_is_initial or typ == '##number##':
- # [4.1.1. Orthographic Heuristic] Check if there's
- # orthographic evidence about whether the next word
- # starts a sentence or not.
- is_sent_starter = self._ortho_heuristic(aug_tok2)
- if is_sent_starter == False:
- aug_tok1.sentbreak = False
- aug_tok1.abbr = True
- if tok_is_initial:
- return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC
- else:
- return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC
- # Special heuristic for initials: if orthographic
- # heuristic is unknown, and next word is always
- # capitalized, then mark as abbrev (eg: J. Bach).
- if (
- is_sent_starter == 'unknown'
- and tok_is_initial
- and aug_tok2.first_upper
- and not (self._params.ortho_context[next_typ] & _ORTHO_LC)
- ):
- aug_tok1.sentbreak = False
- aug_tok1.abbr = True
- return REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC
- return
- def _ortho_heuristic(self, aug_tok):
- """
- Decide whether the given token is the first token in a sentence.
- """
- # Sentences don't start with punctuation marks:
- if aug_tok.tok in self.PUNCTUATION:
- return False
- ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod]
- # If the word is capitalized, occurs at least once with a
- # lower case first letter, and never occurs with an upper case
- # first letter sentence-internally, then it's a sentence starter.
- if (
- aug_tok.first_upper
- and (ortho_context & _ORTHO_LC)
- and not (ortho_context & _ORTHO_MID_UC)
- ):
- return True
- # If the word is lower case, and either (a) we've seen it used
- # with upper case, or (b) we've never seen it used
- # sentence-initially with lower case, then it's not a sentence
- # starter.
- if aug_tok.first_lower and (
- (ortho_context & _ORTHO_UC) or not (ortho_context & _ORTHO_BEG_LC)
- ):
- return False
- # Otherwise, we're not sure.
- return 'unknown'
- DEBUG_DECISION_FMT = '''Text: %(text)r (at offset %(period_index)d)
- Sentence break? %(break_decision)s (%(reason)s)
- Collocation? %(collocation)s
- %(type1)r:
- known abbreviation: %(type1_in_abbrs)s
- is initial: %(type1_is_initial)s
- %(type2)r:
- known sentence starter: %(type2_is_sent_starter)s
- orthographic heuristic suggests is a sentence starter? %(type2_ortho_heuristic)s
- orthographic contexts in training: %(type2_ortho_contexts)s
- '''
- def format_debug_decision(d):
- return DEBUG_DECISION_FMT % d
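- # Typical debugging use (sketch; `tokenizer` and `text` are placeholders):
- #
- # for decision in tokenizer.debug_decisions(text):
- # print(format_debug_decision(decision))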
- def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
- """Builds a punkt model and applies it to the same text"""
- cleanup = (
- lambda s: re.compile(r'(?:\r|^\s+)', re.MULTILINE).sub('', s).replace('\n', ' ')
- )
- trainer = train_cls()
- trainer.INCLUDE_ALL_COLLOCS = True
- trainer.train(text)
- sbd = tok_cls(trainer.get_params())
- for l in sbd.sentences_from_text(text):
- print(cleanup(l))
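- # Example invocation of the demo (sketch; 'corpus.txt' is a placeholder path):
- #
- # if __name__ == '__main__':
- # with open('corpus.txt') as fin:
- # demo(fin.read())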