Skip to content

Instantly share code, notes, and snippets.

@dimagalat
Last active November 16, 2023 18:07
Show Gist options
  • Save dimagalat/49d133f57ecf144ed4cdc2d0c529d4ea to your computer and use it in GitHub Desktop.
Save dimagalat/49d133f57ecf144ed4cdc2d0c529d4ea to your computer and use it in GitHub Desktop.
A cleaning function suitable for deep learning applications
import re
import string
import unicodedata
from typing import Optional
class TextNorm:
    """Stateless text cleaner for preparing raw strings for downstream models.

    The cleaning pipeline (see :meth:`normalize`) can apply Unicode
    normalization, strip HTML-like tags, normalize whitespace, and tidy up
    repeated or badly spaced periods and commas.
    """

    # Patterns are compiled once at class-creation time and shared by all calls.
    _HTML_TAG_RE = re.compile(r"<[^>]+>")
    _ANY_WHITESPACE_RE = re.compile(r"\s+")
    # Whitespace other than "\n" (double negative: not non-space and not newline).
    _NON_NEWLINE_WHITESPACE_RE = re.compile(r"[^\S\n]+")
    # A mark followed by ONE OR MORE further marks, so that runs of any length
    # ("...", ". . . .") collapse in a single pass. A plain pair pattern such
    # as r"\.\s*\." would leave ".." behind for a run of three.
    _REPEATED_PERIOD_RE = re.compile(r"\.(?:\s*\.)+")
    _REPEATED_COMMA_RE = re.compile(r",(?:\s*,)+")
    _SPACE_BEFORE_MARK_RE = re.compile(r" +([.,])")
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self):
        # The normalizer keeps no per-instance state.
        pass

    def normalize(self, s: str, remove_newlines: bool = True, compact_spaces: bool = True,
                  normalize_punctuation: bool = True, unicode_normalization: Optional[str] = None,
                  remove_html_tags_flag: bool = True) -> str:
        """
        Normalizes a text string.

        Steps run in this order: Unicode normalization, HTML tag removal,
        whitespace normalization, punctuation normalization, and finally
        stripping of leading/trailing whitespace.

        :param s: String to be normalized.
        :param remove_newlines: If True, newline characters are replaced by
            spaces. If False, newline characters are preserved.
        :param compact_spaces: If True, runs of whitespace are collapsed into a
            single space (newlines are only included in those runs when
            ``remove_newlines`` is True).
        :param normalize_punctuation: If True, collapses repeated periods and
            commas and removes spaces directly before them.
        :param unicode_normalization: Unicode normalization form passed to
            ``unicodedata.normalize`` (e.g. "NFC", "NFKC"); skipped if falsy.
        :param remove_html_tags_flag: If True, removes HTML tags.
        :return: Normalized string, or an empty string if the result consists
            only of ASCII punctuation (or is empty).
        :raises ValueError: If the input is not a string.
        """
        if not isinstance(s, str):
            raise ValueError("Input must be a string")

        if unicode_normalization:
            s = unicodedata.normalize(unicode_normalization, s)
        if remove_html_tags_flag:
            s = self._remove_html_tags(s)

        s = self._normalize_spaces(s, compact_spaces, remove_newlines)

        if normalize_punctuation:
            s = self._normalize_punctuation(s)

        s = s.strip()
        # An empty string also counts as "only punctuation" and yields "".
        if self._contains_only_punctuation(s):
            return ""
        return s

    def _remove_html_tags(self, text: str) -> str:
        """Remove every ``<...>`` span (anything between '<' and the next '>')."""
        return self._HTML_TAG_RE.sub("", text)

    def _normalize_spaces(self, text: str, compact: bool, remove_newlines: bool = True) -> str:
        """Normalize whitespace according to the two flags.

        :param text: Text to process.
        :param compact: If True, collapse whitespace runs into one space.
        :param remove_newlines: If True, newlines are turned into spaces;
            if False, they are left in place. Defaults to True so existing
            two-argument calls behave as before.
        :return: Text with whitespace normalized.
        """
        if remove_newlines:
            if compact:
                return self._ANY_WHITESPACE_RE.sub(" ", text)
            return text.replace("\n", " ")
        if compact:
            # Collapse only horizontal whitespace so the newlines survive.
            return self._NON_NEWLINE_WHITESPACE_RE.sub(" ", text)
        return text

    def _normalize_punctuation(self, text: str) -> str:
        """Collapse repeated periods/commas and drop spaces right before them."""
        text = self._REPEATED_PERIOD_RE.sub(".", text)
        text = self._REPEATED_COMMA_RE.sub(",", text)
        # Remove every space preceding a period or comma, not just one.
        return self._SPACE_BEFORE_MARK_RE.sub(r"\1", text)

    def _contains_only_punctuation(self, text: str) -> bool:
        """Return True if every character is ASCII punctuation (True for "")."""
        return all(char in self._PUNCTUATION for char in text)
# Usage example:
# normalizer = TextNorm()
# normalized_text = normalizer.normalize("Some text")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment