123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308 |
- .. Copyright (C) 2001-2019 NLTK Project
- .. For license information, see LICENSE.TXT
- ===============================
- Unit test cases for ``toolbox``
- ===============================
- >>> from nltk import toolbox
- --------------------------
- ``toolbox.StandardFormat``
- --------------------------
- >>> f = toolbox.StandardFormat()
- ``toolbox.StandardFormat.open()``
- ---------------------------------
- >>> import os, tempfile
- >>> (fd, fname) = tempfile.mkstemp()
- >>> tf = os.fdopen(fd, "w")
- >>> _ = tf.write('\\lx a value\n\\lx another value\n')
- >>> tf.close()
- >>> f = toolbox.StandardFormat()
- >>> f.open(fname)
- >>> list(f.fields())
- [('lx', 'a value'), ('lx', 'another value')]
- >>> f.close()
- >>> os.unlink(fname)
- ``toolbox.StandardFormat.open_string()``
- ----------------------------------------
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value\n')
- >>> list(f.fields())
- [('lx', 'a value'), ('lx', 'another value')]
- >>> f.close()
- ``toolbox.StandardFormat.close()``
- ----------------------------------
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value\n')
- >>> list(f.fields())
- [('lx', 'a value'), ('lx', 'another value')]
- >>> f.close()
- ``toolbox.StandardFormat.line_num``
- ---------------------------------------
- ``StandardFormat.line_num`` contains the line number of the last line returned:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value\n\\lx a third value\n')
- >>> line_nums = []
- >>> for l in f.raw_fields():
- ... line_nums.append(f.line_num)
- >>> line_nums
- [1, 2, 3]
- For fields that span several lines, ``StandardFormat.line_num`` contains the line number of the last line of the field returned:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
- >>> line_nums = []
- >>> for l in f.raw_fields():
- ... line_nums.append(f.line_num)
- >>> line_nums
- [2, 5, 7]
- ``StandardFormat.line_num`` doesn't exist before opening or after closing
- a file or string:
- >>> f = toolbox.StandardFormat()
- >>> f.line_num
- Traceback (most recent call last):
- ...
- AttributeError: 'StandardFormat' object has no attribute 'line_num'
- >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
- >>> line_nums = []
- >>> for l in f.raw_fields():
- ... line_nums.append(f.line_num)
- >>> line_nums
- [2, 5, 7]
- >>> f.close()
- >>> f.line_num
- Traceback (most recent call last):
- ...
- AttributeError: 'StandardFormat' object has no attribute 'line_num'
- ``toolbox.StandardFormat.raw_fields()``
- ---------------------------------------
- ``raw_fields()`` returns an iterator over tuples of two strings representing the
- marker and its value. The marker is given without the backslash and the value
- without its trailing newline:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value\n')
- >>> list(f.raw_fields())
- [('lx', 'a value'), ('lx', 'another value')]
- an empty file returns nothing:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('')
- >>> list(f.raw_fields())
- []
- file with only a newline currently returns a single field with no marker and an empty value (whether this is the desired result is still an open question):
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\n')
- >>> list(f.raw_fields())
- [(None, '')]
- file with only one field should be parsed ok:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx one value\n')
- >>> list(f.raw_fields())
- [('lx', 'one value')]
- file without a trailing newline should be parsed ok:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value')
- >>> list(f.raw_fields())
- [('lx', 'a value'), ('lx', 'another value')]
- trailing white space is preserved except for the final newline:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
- >>> list(f.raw_fields())
- [('lx', 'trailing space '), ('lx', 'trailing tab\t'), ('lx', 'extra newline\n')]
- line wrapping is preserved:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
- >>> list(f.raw_fields())
- [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
- file beginning with a multiline record should be parsed ok:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
- >>> list(f.raw_fields())
- [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
- file ending with a multiline record should be parsed ok:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lc a value\n\\lx another value\nmore of the value\nand still more\n')
- >>> list(f.raw_fields())
- [('lc', 'a value'), ('lx', 'another value\nmore of the value\nand still more')]
- file beginning with a BOM should be parsed ok:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\xef\xbb\xbf\\lx a value\n\\lx another value\n')
- >>> list(f.raw_fields())
- [('lx', 'a value'), ('lx', 'another value')]
- file beginning with two BOMs should ignore only the first one:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\xef\xbb\xbf\xef\xbb\xbf\\lx a value\n\\lx another value\n')
- >>> list(f.raw_fields())
- [(None, '\xef\xbb\xbf\\lx a value'), ('lx', 'another value')]
- should not ignore a BOM not at the beginning of the file:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\xef\xbb\xbf\\lx another value\n')
- >>> list(f.raw_fields())
- [('lx', 'a value\n\xef\xbb\xbf\\lx another value')]
- ``toolbox.StandardFormat.fields()``
- -----------------------------------
- trailing white space is not preserved:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
- >>> list(f.fields())
- [('lx', 'trailing space'), ('lx', 'trailing tab'), ('lx', 'extra newline')]
- multiline fields are unwrapped:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
- >>> list(f.fields())
- [('lx', 'a value more of the value and still more'), ('lc', 'another val')]
- markers
- -------
- A backslash in the first position on a new line indicates the start of a
- marker. The backslash is not part of the marker:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\mk a value\n')
- >>> list(f.fields())
- [('mk', 'a value')]
- If the backslash occurs later in the line it does not indicate the start
- of a marker:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\mk a value\n \\mk another one\n')
- >>> list(f.raw_fields())
- [('mk', 'a value\n \\mk another one')]
- There is no specific limit to the length of a marker:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\this_is_an_extremely_long_marker value\n')
- >>> list(f.fields())
- [('this_is_an_extremely_long_marker', 'value')]
- A marker can contain any non-whitespace character:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\`~!@#$%^&*()_-=+[{]}\|,<.>/?;:"0123456789 value\n')
- >>> list(f.fields())
- [('`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789', 'value')]
- A marker is terminated by any white space character:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\mk a value\n\\mk\tanother one\n\\mk\rthird one\n\\mk\ffourth one')
- >>> list(f.fields())
- [('mk', 'a value'), ('mk', 'another one'), ('mk', 'third one'), ('mk', 'fourth one')]
- Consecutive whitespace characters (except newline) are treated the same as one:
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\mk \t\r\fa value\n')
- >>> list(f.fields())
- [('mk', 'a value')]
- -----------------------
- ``toolbox.ToolboxData``
- -----------------------
- >>> db = toolbox.ToolboxData()
- ``toolbox.ToolboxData.parse()``
- -------------------------------
- check that normal parsing works:
- >>> from xml.etree import ElementTree
- >>> td = toolbox.ToolboxData()
- >>> s = """\\_sh v3.0 400 Rotokas Dictionary
- ... \\_DateStampHasFourDigitYear
- ...
- ... \\lx kaa
- ... \\ps V.A
- ... \\ge gag
- ... \\gp nek i pas
- ...
- ... \\lx kaa
- ... \\ps V.B
- ... \\ge strangle
- ... \\gp pasim nek
- ... """
- >>> td.open_string(s)
- >>> tree = td.parse(key='lx')
- >>> tree.tag
- 'toolbox_data'
- >>> ElementTree.tostring(list(tree)[0]).decode('utf8')
- '<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
- >>> ElementTree.tostring(list(tree)[1]).decode('utf8')
- '<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
- >>> ElementTree.tostring(list(tree)[2]).decode('utf8')
- '<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
- check that guessing the key marker works:
- >>> from xml.etree import ElementTree
- >>> td = toolbox.ToolboxData()
- >>> s = """\\_sh v3.0 400 Rotokas Dictionary
- ... \\_DateStampHasFourDigitYear
- ...
- ... \\lx kaa
- ... \\ps V.A
- ... \\ge gag
- ... \\gp nek i pas
- ...
- ... \\lx kaa
- ... \\ps V.B
- ... \\ge strangle
- ... \\gp pasim nek
- ... """
- >>> td.open_string(s)
- >>> tree = td.parse()
- >>> ElementTree.tostring(list(tree)[0]).decode('utf8')
- '<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
- >>> ElementTree.tostring(list(tree)[1]).decode('utf8')
- '<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
- >>> ElementTree.tostring(list(tree)[2]).decode('utf8')
- '<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
- -----------------------
- ``toolbox`` functions
- -----------------------
- ``toolbox.to_sfm_string()``
- -------------------------------
|