Source code
Revision control
Copy as Markdown
Other Tools
from __future__ import annotations
import re
import unicodedata
from collections.abc import Iterable
from html.entities import name2codepoint
try:
import unidecode
except ImportError:
import text_unidecode as unidecode
__all__ = ['slugify', 'smart_truncate']
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
QUOTE_PATTERN = re.compile(r'[\']+')
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
DEFAULT_SEPARATOR = '-'
def smart_truncate(
string: str,
max_length: int = 0,
word_boundary: bool = False,
separator: str = " ",
save_order: bool = False,
) -> str:
"""
Truncate a string.
:param string (str): string for modification
:param max_length (int): output string length
:param word_boundary (bool):
:param save_order (bool): if True then word order of output string is like input string
:param separator (str): separator between words
:return:
"""
string = string.strip(separator)
if not max_length:
return string
if len(string) < max_length:
return string
if not word_boundary:
return string[:max_length].strip(separator)
if separator not in string:
return string[:max_length]
truncated = ''
for word in string.split(separator):
if word:
next_len = len(truncated) + len(word)
if next_len < max_length:
truncated += '{}{}'.format(word, separator)
elif next_len == max_length:
truncated += '{}'.format(word)
break
else:
if save_order:
break
if not truncated: # pragma: no cover
truncated = string[:max_length]
return truncated.strip(separator)
def slugify(
text: str,
entities: bool = True,
decimal: bool = True,
hexadecimal: bool = True,
max_length: int = 0,
word_boundary: bool = False,
separator: str = DEFAULT_SEPARATOR,
save_order: bool = False,
stopwords: Iterable[str] = (),
regex_pattern: re.Pattern[str] | str | None = None,
lowercase: bool = True,
replacements: Iterable[Iterable[str]] = (),
allow_unicode: bool = False,
) -> str:
"""
Make a slug from the given text.
:param text (str): initial text
:param entities (bool): converts html entities to unicode
:param decimal (bool): converts html decimal to unicode
:param hexadecimal (bool): converts html hexadecimal to unicode
:param max_length (int): output string length
:param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
:param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
:param separator (str): separator between words
:param stopwords (iterable): words to discount
:param regex_pattern (str): regex pattern for disallowed characters
:param lowercase (bool): activate case sensitivity by setting it to False
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
:param allow_unicode (bool): allow unicode characters
:return (str):
"""
# user-specific replacements
if replacements:
for old, new in replacements:
text = text.replace(old, new)
# ensure text is unicode
if not isinstance(text, str):
text = str(text, 'utf-8', 'ignore')
# replace quotes with dashes - pre-process
text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
# normalize text, convert to unicode if required
if allow_unicode:
text = unicodedata.normalize('NFKC', text)
else:
text = unicodedata.normalize('NFKD', text)
text = unidecode.unidecode(text)
# ensure text is still in unicode
if not isinstance(text, str):
text = str(text, 'utf-8', 'ignore')
# character entity reference
if entities:
text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)
# decimal character reference
if decimal:
try:
text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
except Exception:
pass
# hexadecimal character reference
if hexadecimal:
try:
text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
except Exception:
pass
# re normalize text
if allow_unicode:
text = unicodedata.normalize('NFKC', text)
else:
text = unicodedata.normalize('NFKD', text)
# make the text lowercase (optional)
if lowercase:
text = text.lower()
# remove generated quotes -- post-process
text = QUOTE_PATTERN.sub('', text)
# cleanup numbers
text = NUMBERS_PATTERN.sub('', text)
# replace all other unwanted characters
if allow_unicode:
pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
else:
pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
text = re.sub(pattern, DEFAULT_SEPARATOR, text)
# remove redundant
text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)
# remove stopwords
if stopwords:
if lowercase:
stopwords_lower = [s.lower() for s in stopwords]
words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
else:
words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
text = DEFAULT_SEPARATOR.join(words)
# finalize user-specific replacements
if replacements:
for old, new in replacements:
text = text.replace(old, new)
# smart truncate if requested
if max_length > 0:
text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
if separator != DEFAULT_SEPARATOR:
text = text.replace(DEFAULT_SEPARATOR, separator)
return text