Last active
November 16, 2023 18:07
-
-
Save dimagalat/49d133f57ecf144ed4cdc2d0c529d4ea to your computer and use it in GitHub Desktop.
A cleaning function suitable for deep learning applications
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
import unicodedata | |
from typing import Optional | |
class TextNorm:
    """Configurable text cleaner for preparing raw strings as model input.

    The cleaner is stateless: every option is passed per call to
    :meth:`normalize`, so a single instance can be reused freely.
    """

    # Patterns are compiled once at class-creation time and shared by all calls.
    _HTML_TAG_RE = re.compile(r"<[^>]+>")
    _ANY_WHITESPACE_RE = re.compile(r"\s+")
    # Any whitespace character except "\n" (used when newlines must survive).
    _INLINE_WHITESPACE_RE = re.compile(r"[^\S\n]+")
    # A mark followed by one or more repeats, optionally separated by whitespace.
    _PERIOD_RUN_RE = re.compile(r"\.(?:\s*\.)+")
    _COMMA_RUN_RE = re.compile(r",(?:\s*,)+")
    _SPACE_BEFORE_MARK_RE = re.compile(r" +([.,])")
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self):
        pass

    def normalize(self, s: str, remove_newlines: bool = True, compact_spaces: bool = True,
                  normalize_punctuation: bool = True, unicode_normalization: Optional[str] = None,
                  remove_html_tags_flag: bool = True) -> str:
        """
        Normalizes a text string with options to remove HTML tags, compact spaces,
        normalize punctuation, and apply Unicode normalization.

        Steps run in a fixed order: Unicode normalization, HTML tag removal,
        whitespace handling, punctuation cleanup, and finally stripping of
        leading/trailing whitespace.

        :param s: String to be normalized.
        :param remove_newlines: If True, newline characters are replaced by spaces.
            If False, newlines inside the text are preserved.
        :param compact_spaces: If True, compacts runs of whitespace into one space
            (newlines are only folded in when ``remove_newlines`` is also True).
        :param normalize_punctuation: If True, collapses repeated periods/commas and
            removes spaces placed before them.
        :param unicode_normalization: Unicode normalization form passed to
            ``unicodedata.normalize`` (e.g. "NFC", "NFKC"); skipped when falsy.
        :param remove_html_tags_flag: If True, removes HTML tags.
        :return: Normalized string or an empty string if the result is only punctuation.
        :raises ValueError: If the input is not a string.
        """
        if not isinstance(s, str):
            raise ValueError("Input must be a string")

        if unicode_normalization:
            s = unicodedata.normalize(unicode_normalization, s)
        if remove_html_tags_flag:
            s = self._remove_html_tags(s)

        s = self._normalize_spaces(s, compact_spaces, remove_newlines)

        if normalize_punctuation:
            s = self._normalize_punctuation(s)

        s = s.strip()
        # An empty string also counts as "only punctuation" and yields "".
        return "" if self._contains_only_punctuation(s) else s

    def _remove_html_tags(self, text: str) -> str:
        """Delete every ``<...>`` span from *text*.

        This is a purely textual match, so any ``<`` followed later by ``>``
        is removed together with what lies between them.
        """
        return self._HTML_TAG_RE.sub("", text)

    def _normalize_spaces(self, text: str, compact: bool, remove_newlines: bool = True) -> str:
        """Apply the whitespace policy selected by the two flags.

        :param text: Text to process.
        :param compact: If True, collapse runs of whitespace into a single space.
        :param remove_newlines: If True (the default, matching the historical
            behaviour of this helper), newlines are turned into spaces; if False
            they are left in place.
        :return: Text with the requested whitespace handling applied.
        """
        if remove_newlines:
            if compact:
                return self._ANY_WHITESPACE_RE.sub(" ", text)
            return text.replace("\n", " ")
        if compact:
            # Collapse only non-newline whitespace so line breaks are kept.
            return self._INLINE_WHITESPACE_RE.sub(" ", text)
        return text

    def _normalize_punctuation(self, text: str) -> str:
        """Collapse repeated periods/commas and drop spaces placed before them."""
        # Match the whole run at once; a pairwise pattern would turn "..." into "..".
        text = self._PERIOD_RUN_RE.sub(".", text)
        text = self._COMMA_RUN_RE.sub(",", text)
        # Remove every space directly before a period or comma, not just one.
        return self._SPACE_BEFORE_MARK_RE.sub(r"\1", text)

    def _contains_only_punctuation(self, text: str) -> bool:
        """Return True if every character is in ``string.punctuation`` (ASCII only).

        Vacuously True for the empty string.
        """
        return all(char in self._PUNCTUATION for char in text)
# Usage example:
# normalizer = TextNorm()
# normalized_text = normalizer.normalize("Some text")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment