nemo_curator.stages.text.utils.constants

View as Markdown

Module Contents

Data

bullet_list

common_english_words

ellipsis_marks

end_marks

policy_substrings

regex_alpha

regex_alphanum

regex_digit

regex_hash

regex_paren

regex_url

white_space_list

API

nemo_curator.stages.text.utils.constants.bullet_list = {'•', '‣', '⁃', '⁌', '⁍', '∙', '○', '●', '◘', '◦', '⦾', '⦿'}
nemo_curator.stages.text.utils.constants.common_english_words = {'the', 'be', 'to', 'of', 'and', 'that', 'have', 'with'}
nemo_curator.stages.text.utils.constants.ellipsis_marks = {'...', '[...]', '…', '(...)', '[…]', '-»', 'read more..', 'read more'}
nemo_curator.stages.text.utils.constants.end_marks = ('.', '?', '!', '"', "'")
nemo_curator.stages.text.utils.constants.policy_substrings = ['terms of use', 'privacy policy', 'cookie policy', 'uses cookies', 'privacy ove... (value truncated in the rendered documentation; see the module source for the full list)
nemo_curator.stages.text.utils.constants.regex_alpha = regex.compile('[[:alpha:]]')
nemo_curator.stages.text.utils.constants.regex_alphanum = re.compile('[a-zA-Z0-9\n?!,.]')
nemo_curator.stages.text.utils.constants.regex_digit = regex.compile('[[:digit:]]')
nemo_curator.stages.text.utils.constants.regex_hash = re.compile('#+')
nemo_curator.stages.text.utils.constants.regex_paren = re.compile('{|}|⟨|⟩|\\[|\\]|\\(|\\)')
nemo_curator.stages.text.utils.constants.regex_url = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0... (pattern truncated in the rendered documentation; see the module source for the full expression)
nemo_curator.stages.text.utils.constants.white_space_list = ['\t', '\n', '\r', '\x08', ' ']