charsetprober.py

######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import logging
import re

from .enums import ProbingState


class CharSetProber(object):

    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter=None):
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        return None

    def feed(self, buf):
        pass

    @property
    def state(self):
        return self._state

    def get_confidence(self):
        return 0.0
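
    # Illustrative sketch, not part of the original module: a concrete prober
    # subclasses CharSetProber and overrides the hooks above. The names below
    # (MyProber, 'my-charset', the fixed 0.5 confidence) are hypothetical
    # placeholders, not real chardet probers:
    #
    #     class MyProber(CharSetProber):
    #         @property
    #         def charset_name(self):
    #             return 'my-charset'
    #
    #         def feed(self, buf):
    #             # examine buf, update self._state, then report it
    #             return self.state
    #
    #         def get_confidence(self):
    #             return 0.5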

    @staticmethod
    def filter_high_byte_only(buf):
        buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
        return buf
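
    # Worked example, added for illustration (not in the original file):
    # every run of ASCII bytes (0x00-0x7F) collapses to a single space, so
    #
    #     CharSetProber.filter_high_byte_only(b'abc\xe9\xe8 def\xfc')
    #
    # returns b' \xe9\xe8 \xfc'.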

    @staticmethod
    def filter_international_words(buf):
        """
        We define three types of bytes:
        alphabet: English letters [a-zA-Z]
        international: international characters [\x80-\xFF]
        marker: everything else [^a-zA-Z\x80-\xFF]

        The input buffer can be thought of as containing a series of words
        delimited by markers. This function retains only the words that contain
        at least one international character. All contiguous sequences of
        markers are replaced by a single space ASCII character.

        This filter applies to all scripts which do not use English characters.
        """
        filtered = bytearray()

        # This regex matches only words that have at least one international
        # character. The word may include one marker character at the end.
        words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
                           buf)

        for word in words:
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with a
            # space as markers shouldn't affect our analysis (they are used
            # similarly across all languages and may thus have similar
            # frequencies).
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b'\x80':
                last_char = b' '
            filtered.extend(last_char)

        return filtered
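
    # Worked example, added for illustration (not in the original file):
    # only words containing at least one byte in 0x80-0xFF survive, and a
    # trailing marker byte is normalized to a space, so
    #
    #     CharSetProber.filter_international_words(b'abc, d\xe9j\xe0 vu!')
    #
    # returns bytearray(b'd\xe9j\xe0 ').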

    @staticmethod
    def filter_with_english_letters(buf):
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        """
        filtered = bytearray()
        in_tag = False
        prev = 0

        for curr in range(len(buf)):
            # Slice here to get bytes instead of an int with Python 3
            buf_char = buf[curr:curr + 1]
            # Check if we're coming out of or entering an HTML tag
            if buf_char == b'>':
                in_tag = False
            elif buf_char == b'<':
                in_tag = True

            # If current character is not extended-ASCII and not alphabetic...
            if buf_char < b'\x80' and not buf_char.isalpha():
                # ...and we're not in a tag
                if curr > prev and not in_tag:
                    # Keep everything after last non-extended-ASCII,
                    # non-alphabetic character
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit stretch we kept
                    filtered.extend(b' ')
                prev = curr + 1

        # If we're not in a tag...
        if not in_tag:
            # Keep everything after last non-extended-ASCII, non-alphabetic
            # character
            filtered.extend(buf[prev:])

        return filtered
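
# Worked example, added for illustration (not in the original file): bytes
# inside <...> are dropped, while alphabetic bytes immediately before '>' are
# retained, as the docstring above notes, so
#
#     CharSetProber.filter_with_english_letters(b'caf\xe9 <br> th\xe9')
#
# returns bytearray(b'caf\xe9 br th\xe9').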