chardetect.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. #!/usr/bin/env python
  """
  Script which takes one or more file paths and reports on their detected
  encodings.

  Example::

      % chardetect somefile someotherfile
      somefile: windows-1252 with confidence 0.5
      someotherfile: ascii with confidence 1.0

  If no paths are provided, it takes its input from stdin.
  """
  11. from __future__ import absolute_import, print_function, unicode_literals
  12. import argparse
  13. import sys
  14. from chardet import __version__
  15. from chardet.compat import PY2
  16. from chardet.universaldetector import UniversalDetector
  def description_of(lines, name='stdin'):
      """
      Return a string describing the probable encoding of a file or
      list of strings.

      The lines are fed one at a time to a ``UniversalDetector``; feeding
      stops as soon as the detector reports that it is done.

      :param lines: The lines to get the encoding of.
      :type lines: Iterable of bytes
      :param name: Name of file or collection of lines
      :type name: str
      :returns: ``'<name>: <encoding> with confidence <confidence>'`` when
                the detector result has an encoding, otherwise
                ``'<name>: no result'``.
      :rtype: str
      """
      detector = UniversalDetector()
      for line in lines:
          detector.feed(bytearray(line))
          # shortcut out of the loop to save reading further - particularly
          # useful if we read a BOM.
          if detector.done:
              break
      detector.close()
      result = detector.result

      # Only byte-string names need decoding on Python 2. Calling ``decode``
      # on a name that is already text would first encode it as ASCII
      # implicitly and fail on any non-ASCII character.
      if PY2 and isinstance(name, bytes):
          # getfilesystemencoding() may return None on Python 2, so fall back
          # to the interpreter's default encoding.
          fs_encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
          name = name.decode(fs_encoding, 'ignore')

      if result['encoding']:
          return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
                                                       result['confidence'])
      return '{0}: no result'.format(name)
  def main(argv=None):
      """
      Handles command line arguments and gets things started.

      Every input stream is passed to ``description_of`` and the returned
      line is printed to stdout. Files opened while parsing the arguments
      are closed once they have been processed; the stdin streams are left
      open.

      :param argv: List of arguments, as if specified on the command-line.
                   If None, ``sys.argv[1:]`` is used instead.
      :type argv: list of str
      """
      # Get command line arguments
      parser = argparse.ArgumentParser(
          description="Takes one or more file paths and reports their "
                      "detected encodings")
      parser.add_argument('input',
                          help='File whose encoding we would like to '
                               'determine. (default: stdin)',
                          type=argparse.FileType('rb'), nargs='*',
                          default=[sys.stdin if PY2 else sys.stdin.buffer])
      parser.add_argument('--version', action='version',
                          version='%(prog)s {0}'.format(__version__))
      args = parser.parse_args(argv)

      # The stdin streams are not ours to close; anything else in args.input
      # was opened by argparse.FileType for this call.
      stdin_streams = (sys.stdin, getattr(sys.stdin, 'buffer', None))

      for f in args.input:
          try:
              if f.isatty():
                  print("You are running chardetect interactively. Press "
                        "CTRL-D twice at the start of a blank line to signal "
                        "the end of your input. If you want help, run "
                        "chardetect --help\n", file=sys.stderr)
              print(description_of(f, f.name))
          finally:
              # Release the file handle even if detection or printing fails.
              if not any(f is stream for stream in stdin_streams):
                  f.close()
  # Run the command-line interface only when this file is executed as a
  # script, not when it is imported as a module.
  if __name__ == '__main__':
      main()