diff options
-rwxr-xr-x | pyAggr3g470r.py | 20 | ||||
-rwxr-xr-x | utils.py | 25 |
2 files changed, 36 insertions, 9 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py index 1d1d9f89..2f3ed2a0 100755 --- a/pyAggr3g470r.py +++ b/pyAggr3g470r.py @@ -1031,35 +1031,37 @@ class Root: def export(self, export_method): """ - Export articles stored in the SQLite database in text files. + Export articles stored in the SQLite database in text + (raw or HTML) files. """ for rss_feed_id in self.feeds.keys(): - folder = utils.path + "/var/export/" + self.feeds[rss_feed_id][3] - folder = folder.replace(' ', '_') + folder = utils.path + "/var/export/" + \ + utils.normalize_filename(self.feeds[rss_feed_id][3].strip().encode("utf-8")) try: os.makedirs(folder) except OSError: return self.error_page(utils.path + "var/export/"+" already exists.\nYou should delete this folder.") + for article in self.articles[rss_feed_id]: try: + name = article[1].strip().replace(' ', '_') + name = os.path.normpath(folder + "/" + name + ".html") + f = open(name, "w") + # Export all articles in HTML format if export_method == "export_HTML": - name = folder + "/" + article[1]+ ".html" - f = open(name.replace(' ', '_'), "w") content = htmlheader() content += '\n<div style="width: 50%; overflow:hidden; text-align: justify; margin:0 auto">\n' content += """<h1><a href="%s">%s</a></h1><br />""" % \ (article[3].encode('utf-8'), article[2].encode('utf-8')) content += article[4].encode('utf-8') - content += "</div>" - content += "<hr />\n" + content += "</div>\n<hr />\n" content += htmlfooter # Export all articles in raw text elif export_method == "export_TXT": - name = folder + "/" + article[1] + ".txt" - f = open(name.replace(' ', '_'), "w") content = "Title: " + article[2].encode('utf-8') + "\n\n\n" content += utils.clear_string(article[4].encode('utf-8')) + f.write(content) except IOError: pass @@ -34,6 +34,7 @@ import sqlite3 import operator import urlparse import calendar +import unicodedata import htmlentitydefs try: @@ -157,6 +158,30 @@ def unescape(text): return text # leave as is return re.sub("&#?\w+;", fixup, text) 
def not_combining(char):
    """
    Tell whether a character is anything but a combining mark.

    Returns True unless the Unicode general category of `char` is 'Mn'
    (nonspacing mark).
    """
    return unicodedata.category(char) != 'Mn'


def strip_accents(text, encoding):
    r"""
    Strip accents.

    `text` may be a byte string encoded with `encoding` or an already
    decoded text string; the result has the same type as the input.

    >>> strip_accents(b"d\xc3\xa9j\xc3\xa0", "utf-8") == b"deja"
    True
    """
    is_bytes = isinstance(text, bytes)
    decoded = text.decode(encoding) if is_bytes else text
    # NFD splits an accented letter into its base letter followed by
    # combining marks, so dropping the marks leaves the bare letters.
    decomposed = unicodedata.normalize('NFD', decoded)
    # Join explicitly: filter() only hands back a string on Python 2.
    stripped = u"".join(char for char in decomposed if not_combining(char))
    return stripped.encode(encoding) if is_bytes else stripped


def normalize_filename(name):
    """
    Normalize a file name.

    The characters , ' ! ? | & are removed, whitespace and dots become
    underscores, leading and trailing underscores are trimmed, accents are
    stripped and the result goes through os.path.normpath. A name with
    nothing left therefore comes back as "." (normpath's answer for an
    empty path). Path separators are not removed.

    `name` may be a UTF-8 encoded byte string or a text string; the result
    has the same type as the input.
    """
    is_bytes = isinstance(name, bytes)
    # Decode once up front so every step below works on text.
    file_name = name.decode("utf-8") if is_bytes else name
    file_name = re.sub(r"[,'!?|&]", "", file_name)
    file_name = re.sub(r"[\s.]", "_", file_name)
    # Dots are already underscores at this point, so one strip suffices.
    file_name = file_name.strip('_')
    file_name = os.path.normpath(strip_accents(file_name, "utf-8"))
    return file_name.encode("utf-8") if is_bytes else file_name

# NOTE: in the original listing, the truncated definition
# `def top_words(dic_articles, n=10, size=5)` follows here.