summaryrefslogtreecommitdiff
path: root/venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/chardet/charsetprober.py
diff options
context:
space:
mode:
Diffstat (limited to 'venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/chardet/charsetprober.py')
-rw-r--r--venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/chardet/charsetprober.py145
1 files changed, 0 insertions, 145 deletions
diff --git a/venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/chardet/charsetprober.py b/venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/chardet/charsetprober.py
deleted file mode 100644
index eac4e59..0000000
--- a/venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/chardet/charsetprober.py
+++ /dev/null
@@ -1,145 +0,0 @@
-######################## BEGIN LICENSE BLOCK ########################
-# The Original Code is Mozilla Universal charset detector code.
-#
-# The Initial Developer of the Original Code is
-# Netscape Communications Corporation.
-# Portions created by the Initial Developer are Copyright (C) 2001
-# the Initial Developer. All Rights Reserved.
-#
-# Contributor(s):
-# Mark Pilgrim - port to Python
-# Shy Shalom - original C code
-#
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
-# 02110-1301 USA
-######################### END LICENSE BLOCK #########################
-
-import logging
-import re
-
-from .enums import ProbingState
-
-
-class CharSetProber(object):
-
- SHORTCUT_THRESHOLD = 0.95
-
- def __init__(self, lang_filter=None):
- self._state = None
- self.lang_filter = lang_filter
- self.logger = logging.getLogger(__name__)
-
- def reset(self):
- self._state = ProbingState.DETECTING
-
- @property
- def charset_name(self):
- return None
-
- def feed(self, buf):
- pass
-
- @property
- def state(self):
- return self._state
-
- def get_confidence(self):
- return 0.0
-
- @staticmethod
- def filter_high_byte_only(buf):
- buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
- return buf
-
- @staticmethod
- def filter_international_words(buf):
- """
- We define three types of bytes:
- alphabet: english alphabets [a-zA-Z]
- international: international characters [\x80-\xFF]
- marker: everything else [^a-zA-Z\x80-\xFF]
-
- The input buffer can be thought to contain a series of words delimited
- by markers. This function works to filter all words that contain at
- least one international character. All contiguous sequences of markers
- are replaced by a single space ascii character.
-
- This filter applies to all scripts which do not use English characters.
- """
- filtered = bytearray()
-
- # This regex expression filters out only words that have at-least one
- # international character. The word may include one marker character at
- # the end.
- words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
- buf)
-
- for word in words:
- filtered.extend(word[:-1])
-
- # If the last character in the word is a marker, replace it with a
- # space as markers shouldn't affect our analysis (they are used
- # similarly across all languages and may thus have similar
- # frequencies).
- last_char = word[-1:]
- if not last_char.isalpha() and last_char < b'\x80':
- last_char = b' '
- filtered.extend(last_char)
-
- return filtered
-
- @staticmethod
- def filter_with_english_letters(buf):
- """
- Returns a copy of ``buf`` that retains only the sequences of English
- alphabet and high byte characters that are not between <> characters.
- Also retains English alphabet and high byte characters immediately
- before occurrences of >.
-
- This filter can be applied to all scripts which contain both English
- characters and extended ASCII characters, but is currently only used by
- ``Latin1Prober``.
- """
- filtered = bytearray()
- in_tag = False
- prev = 0
-
- for curr in range(len(buf)):
- # Slice here to get bytes instead of an int with Python 3
- buf_char = buf[curr:curr + 1]
- # Check if we're coming out of or entering an HTML tag
- if buf_char == b'>':
- in_tag = False
- elif buf_char == b'<':
- in_tag = True
-
- # If current character is not extended-ASCII and not alphabetic...
- if buf_char < b'\x80' and not buf_char.isalpha():
- # ...and we're not in a tag
- if curr > prev and not in_tag:
- # Keep everything after last non-extended-ASCII,
- # non-alphabetic character
- filtered.extend(buf[prev:curr])
- # Output a space to delimit stretch we kept
- filtered.extend(b' ')
- prev = curr + 1
-
- # If we're not in a tag...
- if not in_tag:
- # Keep everything after last non-extended-ASCII, non-alphabetic
- # character
- filtered.extend(buf[prev:])
-
- return filtered