about | summary | refs | log | tree | commit | diff
path: root/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'utils.py')
-rwxr-xr-x utils.py 25
1 files changed, 25 insertions, 0 deletions
diff --git a/utils.py b/utils.py
index 34f3423a..8787ed64 100755
--- a/utils.py
+++ b/utils.py
@@ -34,6 +34,7 @@ import sqlite3
import operator
import urlparse
import calendar
+import unicodedata
import htmlentitydefs
try:
@@ -157,6 +158,30 @@ def unescape(text):
return text # leave as is
return re.sub("&#?\w+;", fixup, text)
def not_combining(char):
    """
    Tell whether *char* is anything other than a non-spacing mark.

    Returns True unless the Unicode general category of the single
    unicode character *char* is 'Mn' (Mark, nonspacing).
    """
    return unicodedata.category(char) != 'Mn'


def strip_accents(text, encoding):
    """
    Strip accents.

    The text is decomposed with Unicode normalization form NFD, so that
    an accented letter becomes a base letter followed by non-spacing
    marks, and those marks are then dropped.

    *text* may be an encoded byte string, in which case it is decoded
    with *encoding* and the result is encoded back with the same
    *encoding*.  It may also be an already decoded unicode string, in
    which case *encoding* is not used and a unicode string is returned.

    >>> strip_accents(u"d\\xe9j\\xe0".encode("utf-8"), "utf-8") == b"deja"
    True
    >>> strip_accents(u"d\\xe9j\\xe0", "utf-8") == u"deja"
    True
    """
    # Only byte strings need decoding; calling .decode() on a unicode
    # object would first encode it implicitly as ASCII and fail on the
    # very accented characters this function is meant to handle.
    is_encoded = isinstance(text, bytes)
    unicode_text = text.decode(encoding) if is_encoded else text
    decomposed = unicodedata.normalize('NFD', unicode_text)
    # An explicit join does not depend on filter() returning a string.
    stripped = u"".join(char for char in decomposed if not_combining(char))
    return stripped.encode(encoding) if is_encoded else stripped
+
def normalize_filename(name):
    """
    Normalize a file name.

    The characters , ' ! ? | & are removed, every whitespace character
    and every dot is replaced by an underscore, leading and trailing
    underscores are trimmed, accents are stripped (the name is handled
    as UTF-8) and the result is passed through os.path.normpath.

    Note that a name which reduces to the empty string comes back as
    '.', because that is what os.path.normpath returns for an empty
    path.
    """
    file_name = re.sub(r"[,'!?|&]", "", name)
    file_name = re.sub(r"[\s.]", "_", file_name)
    # No dot can survive the substitution above, so underscores are the
    # only characters left to trim from the edges.
    file_name = file_name.strip('_')
    file_name = strip_accents(file_name, "utf-8")
    return os.path.normpath(file_name)
+
def top_words(dic_articles, n=10, size=5):
"""
Return the n most frequent words in a list.
bgstack15