Normalized file name.

author: cedricbonhomme <devnull@localhost> 2010-10-25 14:53:20 +0200
committer: cedricbonhomme <devnull@localhost> 2010-10-25 14:53:20 +0200
commit: e9db05902f73218912f302387747b9b45bdaf19e (patch)
tree: 265710051e959a2d7bb9379bbfd9cc834a91f190 /utils.py
parent: Added comments. Cleaner code. (diff)
download: newspipe-e9db05902f73218912f302387747b9b45bdaf19e.tar.gz
newspipe-e9db05902f73218912f302387747b9b45bdaf19e.tar.bz2
newspipe-e9db05902f73218912f302387747b9b45bdaf19e.zip
1 files changed, 25 insertions, 0 deletions
diff --git a/utils.py b/utils.py
index 34f3423a..8787ed64 100755
--- a/utils.py
+++ b/utils.py
@@ -34,6 +34,7 @@ import sqlite3
 import operator
 import urlparse
 import calendar
+import unicodedata
 import htmlentitydefs
 
 try:
@@ -157,6 +158,30 @@ def unescape(text):
         return text # leave as is
     return re.sub("&#?\w+;", fixup, text)
 
+def not_combining(char):
+    return unicodedata.category(char) != 'Mn'
+
+def strip_accents(text, encoding):
+    """
+    Strip accents: decode text, drop the combining marks left by NFD decomposition, re-encode.
+
+    >>> print strip_accents("déjà", "utf-8")
+    deja
+    """
+    unicode_text= unicodedata.normalize('NFD', text.decode(encoding))
+    return filter(not_combining, unicode_text).encode(encoding)
+
+def normalize_filename(name):
+    """
+    Normalize a file name: drop ,'!?|& characters, turn whitespace and dots into "_", strip accents.
+    """
+    file_name = re.sub("[,'!?|&]", "", name)
+    file_name = re.sub("[\s.]", "_", file_name)
+    file_name = file_name.strip('_')
+    file_name = file_name.strip('.')
+    file_name = strip_accents(file_name, "utf-8")
+    return os.path.normpath(file_name)
+
 def top_words(dic_articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
author	cedricbonhomme <devnull@localhost>	2010-10-25 14:53:20 +0200
committer	cedricbonhomme <devnull@localhost>	2010-10-25 14:53:20 +0200
commit	e9db05902f73218912f302387747b9b45bdaf19e (patch)
tree	265710051e959a2d7bb9379bbfd9cc834a91f190 /utils.py
parent	Added comments. Cleaner code. (diff)
download	newspipe-e9db05902f73218912f302387747b9b45bdaf19e.tar.gz newspipe-e9db05902f73218912f302387747b9b45bdaf19e.tar.bz2 newspipe-e9db05902f73218912f302387747b9b45bdaf19e.zip