# dimagalat/norm_text_v0.1.py

## norm_text_v0.1.py
import re
import string
import unicodedata
from typing import Optional


class TextNorm:
    """Stateless text normalizer.

    Strips HTML-like tags, tidies whitespace, collapses repeated periods and
    commas, and optionally applies a Unicode normalization form.
    """

    # Patterns are compiled once and shared by every instance.
    # Naive tag matcher: removes any "<...>" span; it is not an HTML parser.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _ANY_WHITESPACE_RE = re.compile(r'\s+')
    # Whitespace other than line breaks, used when newlines must survive.
    _INLINE_WHITESPACE_RE = re.compile(r'[^\S\r\n]+')
    # A mark followed by one or more repeats (optionally separated by
    # whitespace), so that "..." and ". . ." both collapse to a single mark.
    _REPEATED_PERIOD_RE = re.compile(r"\.(?:\s*\.)+")
    _REPEATED_COMMA_RE = re.compile(r",(?:\s*,)+")
    _SPACE_BEFORE_MARK_RE = re.compile(r" +([.,])")

    def __init__(self):
        """Create a normalizer; there is nothing to configure."""

    def normalize(self, s: str, remove_newlines: bool = True, compact_spaces: bool = True,
                  normalize_punctuation: bool = True, unicode_normalization: Optional[str] = None,
                  remove_html_tags_flag: bool = True) -> str:
        """
        Normalize a text string.

        Steps run in this order: Unicode normalization, HTML tag removal,
        whitespace handling, punctuation normalization, and finally stripping
        of leading/trailing whitespace.

        :param s: String to be normalized.
        :param remove_newlines: If True, newline characters are replaced by
            spaces. If False, line breaks inside the text are preserved.
        :param compact_spaces: If True, runs of whitespace are compacted into
            a single space (line breaks are kept when ``remove_newlines`` is
            False).
        :param normalize_punctuation: If True, repeated periods/commas are
            collapsed and spaces directly before them are removed.
        :param unicode_normalization: Unicode normalization form passed to
            ``unicodedata.normalize`` (e.g. ``"NFC"``, ``"NFKC"``); skipped
            when None or empty.
        :param remove_html_tags_flag: If True, removes HTML tags.
        :return: Normalized string, or an empty string if the result is empty
            or consists only of ``string.punctuation`` characters.
        :raises ValueError: If the input is not a string.
        """
        if not isinstance(s, str):
            raise ValueError("Input must be a string")

        if unicode_normalization:
            s = unicodedata.normalize(unicode_normalization, s)

        if remove_html_tags_flag:
            s = self._remove_html_tags(s)

        s = self._normalize_spaces(s, compact_spaces, remove_newlines)

        if normalize_punctuation:
            s = self._normalize_punctuation(s)

        s = s.strip()

        if self._contains_only_punctuation(s):
            return ""

        return s

    def _remove_html_tags(self, text: str) -> str:
        """Return ``text`` with every ``<...>`` span removed."""
        return self._HTML_TAG_RE.sub('', text)

    def _normalize_spaces(self, text: str, compact: bool, remove_newlines: bool = True) -> str:
        """
        Apply the whitespace policy selected by the two flags.

        :param text: Text to process.
        :param compact: If True, collapse whitespace runs into one space.
        :param remove_newlines: If True, newlines are turned into spaces;
            if False, line breaks are left in place.
        :return: Text with whitespace handled accordingly.
        """
        if compact:
            if remove_newlines:
                return self._ANY_WHITESPACE_RE.sub(' ', text)
            # Only collapse spaces/tabs so the caller's line breaks survive.
            return self._INLINE_WHITESPACE_RE.sub(' ', text)
        if remove_newlines:
            return text.replace("\n", " ")
        return text

    def _normalize_punctuation(self, text: str) -> str:
        """Collapse repeated periods/commas and drop spaces before them."""
        text = self._REPEATED_PERIOD_RE.sub(".", text)
        text = self._REPEATED_COMMA_RE.sub(",", text)
        # Remove every space directly in front of a period or comma.
        return self._SPACE_BEFORE_MARK_RE.sub(r"\1", text)

    def _contains_only_punctuation(self, text: str) -> bool:
        """Return True if every character is in ``string.punctuation``.

        An empty string also yields True, because ``all`` of nothing is True.
        """
        return all(char in string.punctuation for char in text)


# Usage example:
# normalizer = TextNorm()
# normalized_text = normalizer.normalize("Some text")
	import re
	import string
	import unicodedata
	from typing import Optional


	class TextNorm:
	    """Stateless text normalizer.

	    Strips HTML-like tags, tidies whitespace, collapses repeated periods and
	    commas, and optionally applies a Unicode normalization form.
	    """

	    # Patterns are compiled once and shared by every instance.
	    # Naive tag matcher: removes any "<...>" span; it is not an HTML parser.
	    _HTML_TAG_RE = re.compile(r'<[^>]+>')
	    _ANY_WHITESPACE_RE = re.compile(r'\s+')
	    # Whitespace other than line breaks, used when newlines must survive.
	    _INLINE_WHITESPACE_RE = re.compile(r'[^\S\r\n]+')
	    # A mark followed by one or more repeats (optionally separated by
	    # whitespace), so that "..." and ". . ." both collapse to a single mark.
	    _REPEATED_PERIOD_RE = re.compile(r"\.(?:\s*\.)+")
	    _REPEATED_COMMA_RE = re.compile(r",(?:\s*,)+")
	    _SPACE_BEFORE_MARK_RE = re.compile(r" +([.,])")

	    def __init__(self):
	        """Create a normalizer; there is nothing to configure."""

	    def normalize(self, s: str, remove_newlines: bool = True, compact_spaces: bool = True,
	                  normalize_punctuation: bool = True, unicode_normalization: Optional[str] = None,
	                  remove_html_tags_flag: bool = True) -> str:
	        """
	        Normalize a text string.

	        Steps run in this order: Unicode normalization, HTML tag removal,
	        whitespace handling, punctuation normalization, and finally stripping
	        of leading/trailing whitespace.

	        :param s: String to be normalized.
	        :param remove_newlines: If True, newline characters are replaced by
	            spaces. If False, line breaks inside the text are preserved.
	        :param compact_spaces: If True, runs of whitespace are compacted into
	            a single space (line breaks are kept when ``remove_newlines`` is
	            False).
	        :param normalize_punctuation: If True, repeated periods/commas are
	            collapsed and spaces directly before them are removed.
	        :param unicode_normalization: Unicode normalization form passed to
	            ``unicodedata.normalize`` (e.g. ``"NFC"``, ``"NFKC"``); skipped
	            when None or empty.
	        :param remove_html_tags_flag: If True, removes HTML tags.
	        :return: Normalized string, or an empty string if the result is empty
	            or consists only of ``string.punctuation`` characters.
	        :raises ValueError: If the input is not a string.
	        """
	        if not isinstance(s, str):
	            raise ValueError("Input must be a string")

	        if unicode_normalization:
	            s = unicodedata.normalize(unicode_normalization, s)

	        if remove_html_tags_flag:
	            s = self._remove_html_tags(s)

	        s = self._normalize_spaces(s, compact_spaces, remove_newlines)

	        if normalize_punctuation:
	            s = self._normalize_punctuation(s)

	        s = s.strip()

	        if self._contains_only_punctuation(s):
	            return ""

	        return s

	    def _remove_html_tags(self, text: str) -> str:
	        """Return ``text`` with every ``<...>`` span removed."""
	        return self._HTML_TAG_RE.sub('', text)

	    def _normalize_spaces(self, text: str, compact: bool, remove_newlines: bool = True) -> str:
	        """
	        Apply the whitespace policy selected by the two flags.

	        :param text: Text to process.
	        :param compact: If True, collapse whitespace runs into one space.
	        :param remove_newlines: If True, newlines are turned into spaces;
	            if False, line breaks are left in place.
	        :return: Text with whitespace handled accordingly.
	        """
	        if compact:
	            if remove_newlines:
	                return self._ANY_WHITESPACE_RE.sub(' ', text)
	            # Only collapse spaces/tabs so the caller's line breaks survive.
	            return self._INLINE_WHITESPACE_RE.sub(' ', text)
	        if remove_newlines:
	            return text.replace("\n", " ")
	        return text

	    def _normalize_punctuation(self, text: str) -> str:
	        """Collapse repeated periods/commas and drop spaces before them."""
	        text = self._REPEATED_PERIOD_RE.sub(".", text)
	        text = self._REPEATED_COMMA_RE.sub(",", text)
	        # Remove every space directly in front of a period or comma.
	        return self._SPACE_BEFORE_MARK_RE.sub(r"\1", text)

	    def _contains_only_punctuation(self, text: str) -> bool:
	        """Return True if every character is in ``string.punctuation``.

	        An empty string also yields True, because ``all`` of nothing is True.
	        """
	        return all(char in string.punctuation for char in text)


	# Usage example:
	# normalizer = TextNorm()
	# normalized_text = normalizer.normalize("Some text")