From e9db05902f73218912f302387747b9b45bdaf19e Mon Sep 17 00:00:00 2001 From: cedricbonhomme Date: Mon, 25 Oct 2010 14:53:20 +0200 Subject: Normalized file name. --- pyAggr3g470r.py | 20 +++++++++++--------- utils.py | 25 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py index 1d1d9f89..2f3ed2a0 100755 --- a/pyAggr3g470r.py +++ b/pyAggr3g470r.py @@ -1031,35 +1031,37 @@ class Root: def export(self, export_method): """ - Export articles stored in the SQLite database in text files. + Export articles stored in the SQLite database in text + (raw or HTML) files. """ for rss_feed_id in self.feeds.keys(): - folder = utils.path + "/var/export/" + self.feeds[rss_feed_id][3] - folder = folder.replace(' ', '_') + folder = utils.path + "/var/export/" + \ + utils.normalize_filename(self.feeds[rss_feed_id][3].strip().encode("utf-8")) try: os.makedirs(folder) except OSError: return self.error_page(utils.path + "var/export/"+" already exists.\nYou should delete this folder.") + for article in self.articles[rss_feed_id]: try: + name = article[1].strip().replace(' ', '_') + name = os.path.normpath(folder + "/" + name + ".html") + f = open(name, "w") + # Export all articles in HTML format if export_method == "export_HTML": - name = folder + "/" + article[1]+ ".html" - f = open(name.replace(' ', '_'), "w") content = htmlheader() content += '\n
\n' content += """

%s


""" % \ (article[3].encode('utf-8'), article[2].encode('utf-8')) content += article[4].encode('utf-8') - content += "
" - content += "
\n" + content += "\n
\n" content += htmlfooter # Export all articles in raw text elif export_method == "export_TXT": - name = folder + "/" + article[1] + ".txt" - f = open(name.replace(' ', '_'), "w") content = "Title: " + article[2].encode('utf-8') + "\n\n\n" content += utils.clear_string(article[4].encode('utf-8')) + f.write(content) except IOError: pass diff --git a/utils.py b/utils.py index 34f3423a..8787ed64 100755 --- a/utils.py +++ b/utils.py @@ -34,6 +34,7 @@ import sqlite3 import operator import urlparse import calendar +import unicodedata import htmlentitydefs try: @@ -157,6 +158,30 @@ def unescape(text): return text # leave as is return re.sub("&#?\w+;", fixup, text) +def not_combining(char): + return unicodedata.category(char) != 'Mn' + +def strip_accents(text, encoding): + """ + Strip accents. + + >>> print strip_accents("déjà", "utf-8") + deja + """ + unicode_text= unicodedata.normalize('NFD', text.decode(encoding)) + return filter(not_combining, unicode_text).encode(encoding) + +def normalize_filename(name): + """ + Normalize a file name. + """ + file_name = re.sub("[,'!?|&]", "", name) + file_name = re.sub("[\s.]", "_", file_name) + file_name = file_name.strip('_') + file_name = file_name.strip('.') + file_name = strip_accents(file_name, "utf-8") + return os.path.normpath(file_name) + def top_words(dic_articles, n=10, size=5): """ Return the n most frequent words in a list. -- cgit