123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549 |
- # Natural Language Toolkit: Relation Extraction
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Ewan Klein <ewan@inf.ed.ac.uk>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Code for extracting relational triples from the ieer and conll2002 corpora.
- Relations are stored internally as dictionaries ('reldicts').
- The two serialization outputs are "rtuple" and "clause".
- - An rtuple is a tuple of the form ``(subj, filler, obj)``,
- where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words
- occurring between ``subj`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to
- circumvent locale variations in rendering utf-8 encoded strings.
- - A clause is an atom of the form ``relsym(subjsym, objsym)``,
- where the relation, subject and object have been canonicalized to single strings.
- """
- from __future__ import print_function
- # todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?
- from collections import defaultdict
- import re
- from six.moves import html_entities
# Named Entity class inventory for each supported corpus.
NE_CLASSES = {
    'ieer': [
        'LOCATION',
        'ORGANIZATION',
        'PERSON',
        'DURATION',
        'DATE',
        'CARDINAL',
        'PERCENT',
        'MONEY',
        'MEASURE',
    ],
    'conll2002': ['LOC', 'PER', 'ORG'],
}
# The 'ace' inventory is the 'ieer' one followed by two additional classes.
NE_CLASSES['ace'] = NE_CLASSES['ieer'] + ['FACILITY', 'GPE']

# Abbreviated class labels and their long forms (and the inverse mapping).
short2long = {'LOC': 'LOCATION', 'ORG': 'ORGANIZATION', 'PER': 'PERSON'}
long2short = {full: abbrev for abbrev, full in short2long.items()}
def _expand(type):
    """
    Map an abbreviated NE class label to its long form.

    Labels that have no entry in ``short2long`` are returned unchanged.

    :type type: str
    :rtype: str
    """
    return short2long.get(type, type)
def class_abbrev(type):
    """
    Map a long NE class label to its abbreviated form.

    Labels that have no entry in ``long2short`` are returned unchanged.

    :type type: str
    :rtype: str
    """
    return long2short.get(type, type)
- def _join(lst, sep=' ', untag=False):
- """
- Join a list into a string, turning tags tuples into tag strings or just words.
- :param untag: if ``True``, omit the tag from tagged input strings.
- :type lst: list
- :rtype: str
- """
- try:
- return sep.join(lst)
- except TypeError:
- if untag:
- return sep.join(tup[0] for tup in lst)
- from nltk.tag import tuple2str
- return sep.join(tuple2str(tup) for tup in lst)
def descape_entity(m, defs=html_entities.entitydefs):
    """
    Translate one entity to its ISO Latin value.

    Intended as the replacement callback of a regular-expression
    substitution whose first group captures the entity name, e.g. turning
    ``'mcglashan_&amp;_sarrail'`` into ``'mcglashan_&_sarrail'``.
    Inspired by example from effbot.org

    :param m: a match object whose group 1 is the entity name
    :param defs: mapping from entity names to replacement strings
    :return: the replacement for a known entity, otherwise the matched
        text unchanged
    """
    entity_name = m.group(1)
    try:
        replacement = defs[entity_name]
    except KeyError:
        # Unknown entity: use as is.
        replacement = m.group(0)
    return replacement
def list2sym(lst):
    """
    Convert a list of strings into a canonical symbol.

    The items are joined with underscores (dropping any tags), lowercased,
    entity references of the form ``&name;`` are replaced via
    ``descape_entity``, and full stops are removed.

    :type lst: list
    :return: a Unicode string without whitespace
    :rtype: unicode
    """
    sym = _join(lst, '_', untag=True).lower()
    # Raw string: a backslash-w inside a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    entity_ref = re.compile(r"&(\w+?);")
    sym = entity_ref.sub(descape_entity, sym)
    return sym.replace('.', '')
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).

    Each semi-relation pairs the (possibly empty) run of terminals that
    precedes a Named Entity subtree with that subtree, which makes it easy
    to build (``Tree``, string, ``Tree``) triples afterwards.  Terminals
    that follow the last subtree are not included in the result.

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """
    from nltk.tree import Tree

    semi_rels = []
    preceding = []
    for child in tree:
        if isinstance(child, Tree):
            # A subtree closes the current semi-relation.
            semi_rels.append([preceding, child])
            preceding = []
        else:
            preceding.append(child)
    return semi_rels
def semi_rel2reldict(pairs, window=5, trace=False):
    """
    Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within
    a given input sentence).

    Every run of three consecutive pairs yields one reldict: the first pair
    supplies the left context and the subject, the second the filler and the
    object, and the third the right context.  Fewer than three pairs
    therefore produce an empty result.

    :param pairs: a list of pairs of list(str) and ``Tree``, as generated by
        ``tree2semi_rel``
    :param window: a threshold for the number of items to include in the left and right context
    :type window: int
    :param trace: if true, print the untagged filler and the subject/object
        classes of every reldict as it is built
    :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym',
        'filler', 'untagged_filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    :rtype: list(defaultdict)
    """
    result = []
    # Slide a window of three consecutive pairs over the input in one pass
    # instead of re-copying the remaining list on every iteration.
    for left, middle, right in zip(pairs, pairs[1:], pairs[2:]):
        subj_leaves = left[1].leaves()
        obj_leaves = middle[1].leaves()

        reldict = defaultdict(str)
        reldict['lcon'] = _join(left[0][-window:])
        reldict['subjclass'] = left[1].label()
        reldict['subjtext'] = _join(subj_leaves)
        reldict['subjsym'] = list2sym(subj_leaves)
        reldict['filler'] = _join(middle[0])
        reldict['untagged_filler'] = _join(middle[0], untag=True)
        reldict['objclass'] = middle[1].label()
        reldict['objtext'] = _join(obj_leaves)
        reldict['objsym'] = list2sym(obj_leaves)
        reldict['rcon'] = _join(right[0][:window])
        if trace:
            print(
                "(%s(%s, %s)"
                % (
                    reldict['untagged_filler'],
                    reldict['subjclass'],
                    reldict['objclass'],
                )
            )
        result.append(reldict)
    return result
def _resolve_ne_class(ne_class, role, corpus):
    """Return ``ne_class`` in the form used by ``corpus``, expanding abbreviations if needed."""
    # An empty/None class is passed through without consulting the corpus.
    if not ne_class or ne_class in NE_CLASSES[corpus]:
        return ne_class
    expanded = _expand(ne_class)
    if expanded in NE_CLASSES[corpus]:
        return expanded
    raise ValueError(
        "your value for the %s type has not been recognized: %s" % (role, ne_class)
    )


def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
    """
    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.

    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of the classes listed for the
    chosen corpus in ``NE_CLASSES``; abbreviated labels such as 'ORG' are
    expanded when the corpus uses the long form).

    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ace', 'ieer' and 'conll2002'
    :type corpus: str
    :param pattern: a regular expression for filtering the fillers of
        retrieved triples; if ``None``, fillers are not filtered by pattern.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
    :type window: int
    :return: see ``semi_rel2reldict``
    :rtype: list(defaultdict)
    :raises ValueError: if a class label or the corpus name is not recognized
    """
    subjclass = _resolve_ne_class(subjclass, 'subject', corpus)
    objclass = _resolve_ne_class(objclass, 'object', corpus)

    if corpus in ('ace', 'conll2002'):
        pairs = tree2semi_rel(doc)
    elif corpus == 'ieer':
        # IEER documents carry both a body text and a headline.
        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    reldicts = semi_rel2reldict(pairs)

    return [
        rel
        for rel in reldicts
        if rel['subjclass'] == subjclass
        and len(rel['filler'].split()) <= window
        # ``pattern`` is optional; without it only class and length are checked.
        and (pattern is None or pattern.match(rel['filler']))
        and rel['objclass'] == objclass
    ]
def rtuple(reldict, lcon=False, rcon=False):
    """
    Render a reldict as an rtuple string.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param lcon: if true, prefix the output with the left context
    :param rcon: if true, append the right context to the output
    :return: the formatted rtuple
    :rtype: str
    """
    fields = [
        class_abbrev(reldict['subjclass']),
        reldict['subjtext'],
        reldict['filler'],
        class_abbrev(reldict['objclass']),
        reldict['objtext'],
    ]
    template = '[%s: %r] %r [%s: %r]'
    if lcon:
        fields.insert(0, reldict['lcon'])
        template = '...%r)' + template
    if rcon:
        fields.append(reldict['rcon'])
        template += '(%r...'
    return template % tuple(fields)
def clause(reldict, relsym):
    """
    Render the relation in clausal form, ``relsym(subjsym, objsym)``.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param relsym: a label for the relation
    :type relsym: str
    :return: the formatted clause
    :rtype: str
    """
    subject_symbol = reldict['subjsym']
    object_symbol = reldict['objsym']
    return "%s(%r, %r)" % (relsym, subject_symbol, object_symbol)
- #######################################################
- # Demos of relation extraction with regular expressions
- #######################################################
- ############################################
- # Example of in(ORG, LOC)
- ############################################
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.  If ``sqlite3`` cannot be imported a warning is issued and the
    sql flag is ignored.

    :param trace: if true, print each document number before its clauses
    :param sql: whether to also store and query the pairs through sqlite
    """
    from nltk.corpus import ieer

    # ``cur`` stays None when sql is off or sqlite3 is unavailable; every
    # database step below is guarded by that single check.
    connection = None
    cur = None
    if sql:
        try:
            import sqlite3
        except ImportError:
            import warnings

            warnings.warn("Cannot import sqlite; sql flag will be ignored.")
        else:
            import sys

            connection = sqlite3.connect(":memory:")
            # OptimizedUnicode only matters on Python 2.  On Python 3 it was a
            # plain alias of ``str`` (the default) and has since been removed,
            # so referencing it there would raise AttributeError.
            if sys.version_info[0] < 3:
                connection.text_factory = sqlite3.OptimizedUnicode
            cur = connection.cursor()
            cur.execute(
                """create table Locations
                (OrgName text, LocationName text, DocID text)"""
            )

    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    try:
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                if trace:
                    print(doc.docno)
                    print("=" * 15)
                for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                    print(clause(rel, relsym='IN'))
                    if cur is not None:
                        row = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute(
                            """insert into Locations
                            values (?, ?, ?)""",
                            row,
                        )
                        connection.commit()

        if cur is not None:
            cur.execute(
                """select OrgName from Locations
                where LocationName = 'Atlanta'"""
            )
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
    finally:
        # Release the in-memory database even if a demo step fails.
        if connection is not None:
            connection.close()
- ############################################
- # Example of has_role(PER, ORG)
- ############################################
def roles_demo(trace=0):
    """
    Print (PER, ORG) rtuples from the IEER corpus whose filler matches a
    pattern of role words.

    :param trace: if true, print each document number and include the left
        and right context in every rtuple
    """
    from nltk.corpus import ieer

    # Raw string so that the backslash escapes reach the regex engine
    # untouched instead of being invalid string escapes.
    # NOTE(review): only the first group of roles is wrapped in ``.*``; the
    # alternatives after it must match at the start of the filler -- confirm
    # that this is intended.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*         # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    # Context is shown exactly when tracing is on.
    show_context = bool(trace)
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                print(rtuple(rel, lcon=show_context, rcon=show_context))
- ##############################################
- ### Show what's in the IEER Headlines
- ##############################################
def ieer_headlines():
    """
    Print the document number and headline of the first 20 IEER documents.
    """
    from itertools import islice

    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Generate lazily so that only the documents actually printed are parsed.
    headlines = (
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    )
    for docno, headline in islice(headlines, 20):
        print()
        print("%s:\n%s" % (docno, headline))
- #############################################
- ## Dutch CONLL2002: take_on_role(PER, ORG)
- #############################################
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    :param trace: if true, include the left and right context in each
        printed rtuple
    """
    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    # Both context flags follow the trace setting.
    show_context = True if trace else False
    for sentence in conll2002.chunked_sents('ned.train'):
        matches = extract_rels(
            'PER', 'ORG', sentence, corpus='conll2002', pattern=VAN, window=10
        )
        for rel in matches:
            print(rtuple(rel, lcon=show_context, rcon=show_context))
- #############################################
- ## Spanish CONLL2002: de(ORG, LOC)
- #############################################
def conllesp():
    """
    Print the first 10 de(ORG, LOC) clauses found in the Spanish tagged
    training corpus from CoNLL 2002.
    """
    from itertools import islice

    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    # Extract lazily: only as many sentences as are needed to obtain ten
    # relations get processed, rather than the whole training corpus.
    rels = (
        rel
        for doc in conll2002.chunked_sents('esp.train')
        for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)
    )
    for rel in islice(rels, 10):
        print(clause(rel, relsym='DE'))
    print()
def ne_chunked():
    """
    Run the NLTK NE chunker over the first 1500 tagged Penn Treebank
    sentences and print the (PER, ORG) rtuples whose filler mentions a role,
    each prefixed with the index of its sentence.
    """
    # Imported here so that the function also works when this module is
    # imported rather than run as a script.
    import nltk

    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(
        r'.*(chairman|president|trader|scientist|economist|analyst|partner).*'
    )
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        chunked = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', chunked, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
if __name__ == '__main__':
    # Script entry point: run every demo in this module in turn.
    # ``nltk`` is bound as a module-level name here, which keeps it available
    # to demos that refer to it as a global.
    import nltk
    from nltk.sem import relextract
    # IEER demos.
    in_demo(trace=0)
    roles_demo(trace=0)
    # CoNLL 2002 demos (Dutch, then Spanish).
    conllned()
    conllesp()
    ieer_headlines()
    # Penn Treebank sentences run through the NE chunker.
    ne_chunked()
|