diff options
author | cedricbonhomme <devnull@localhost> | 2010-10-25 14:53:20 +0200 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-10-25 14:53:20 +0200 |
commit | e9db05902f73218912f302387747b9b45bdaf19e (patch) | |
tree | 265710051e959a2d7bb9379bbfd9cc834a91f190 /utils.py | |
parent | Added comments. Cleaner code. (diff) | |
download | newspipe-e9db05902f73218912f302387747b9b45bdaf19e.tar.gz newspipe-e9db05902f73218912f302387747b9b45bdaf19e.tar.bz2 newspipe-e9db05902f73218912f302387747b9b45bdaf19e.zip |
Normalized file name.
Diffstat (limited to 'utils.py')
-rwxr-xr-x | utils.py | 25 |
1 files changed, 25 insertions, 0 deletions
```diff
@@ -34,6 +34,7 @@ import sqlite3
 import operator
 import urlparse
 import calendar
+import unicodedata
 import htmlentitydefs
 
 try:
@@ -157,6 +158,30 @@ def unescape(text):
         return text # leave as is
     return re.sub("&#?\w+;", fixup, text)
 
+def not_combining(char):
+    return unicodedata.category(char) != 'Mn'
+
+def strip_accents(text, encoding):
+    """
+    Strip accents.
+
+    >>> print strip_accents("déjà", "utf-8")
+    deja
+    """
+    unicode_text= unicodedata.normalize('NFD', text.decode(encoding))
+    return filter(not_combining, unicode_text).encode(encoding)
+
+def normalize_filename(name):
+    """
+    Normalize a file name.
+    """
+    file_name = re.sub("[,'!?|&]", "", name)
+    file_name = re.sub("[\s.]", "_", file_name)
+    file_name = file_name.strip('_')
+    file_name = file_name.strip('.')
+    file_name = strip_accents(file_name, "utf-8")
+    return os.path.normpath(file_name)
+
 def top_words(dic_articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
```