-rwxr-xr-x  pyAggr3g470r.py  20
-rwxr-xr-x  utils.py         25
2 files changed, 36 insertions, 9 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index 1d1d9f89..2f3ed2a0 100755
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -1031,35 +1031,37 @@ class Root:
 
     def export(self, export_method):
         """
-        Export articles stored in the SQLite database in text files.
+        Export articles stored in the SQLite database in text
+        (raw or HTML) files.
         """
         for rss_feed_id in self.feeds.keys():
-            folder = utils.path + "/var/export/" + self.feeds[rss_feed_id][3]
-            folder = folder.replace(' ', '_')
+            folder = utils.path + "/var/export/" + \
+                     utils.normalize_filename(self.feeds[rss_feed_id][3].strip().encode("utf-8"))
             try:
                 os.makedirs(folder)
             except OSError:
                 return self.error_page(utils.path + "var/export/"+" already exists.\nYou should delete this folder.")
+
             for article in self.articles[rss_feed_id]:
                 try:
+                    name = article[1].strip().replace(' ', '_')
+                    name = os.path.normpath(folder + "/" + name + ".html")
+                    f = open(name, "w")
+
                     # Export all articles in HTML format
                     if export_method == "export_HTML":
-                        name = folder + "/" + article[1]+ ".html"
-                        f = open(name.replace(' ', '_'), "w")
                         content = htmlheader()
                         content += '\n<div style="width: 50%; overflow:hidden; text-align: justify; margin:0 auto">\n'
                         content += """<h1><a href="%s">%s</a></h1><br />""" % \
                                 (article[3].encode('utf-8'), article[2].encode('utf-8'))
                         content += article[4].encode('utf-8')
-                        content += "</div>"
-                        content += "<hr />\n"
+                        content += "</div>\n<hr />\n"
                         content += htmlfooter
                     # Export all articles in raw text
                     elif export_method == "export_TXT":
-                        name = folder + "/" + article[1] + ".txt"
-                        f = open(name.replace(' ', '_'), "w")
                         content = "Title: " + article[2].encode('utf-8') + "\n\n\n"
                         content += utils.clear_string(article[4].encode('utf-8'))
+
                     f.write(content)
                 except IOError:
                     pass
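For context, a minimal standalone sketch (not part of the commit) of the file-naming scheme the rewritten export() uses, assuming utils.path and utils.normalize_filename behave as shown in the utils.py hunk below; the feed title and article key are invented sample values:

# -*- coding: utf-8 -*-
# Sketch (Python 2, as in the codebase): derive the per-feed export folder
# and the per-article file name the same way the new export() code does.
import os
import utils

feed_title = u"Le Monde Informatique"      # invented sample feed title
article_key = "2010-09-22 10:30:00"        # invented sample article[1] value

folder = utils.path + "/var/export/" + \
         utils.normalize_filename(feed_title.strip().encode("utf-8"))
name = article_key.strip().replace(' ', '_')
target = os.path.normpath(folder + "/" + name + ".html")
# folder ends with ".../var/export/Le_Monde_Informatique"
# target ends with ".../Le_Monde_Informatique/2010-09-22_10:30:00.html"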
diff --git a/utils.py b/utils.py
index 34f3423a..8787ed64 100755
--- a/utils.py
+++ b/utils.py
@@ -34,6 +34,7 @@ import sqlite3
 import operator
 import urlparse
 import calendar
+import unicodedata
 import htmlentitydefs
 try:
@@ -157,6 +158,30 @@ def unescape(text):
         return text # leave as is
     return re.sub("&#?\w+;", fixup, text)
 
+def not_combining(char):
+    return unicodedata.category(char) != 'Mn'
+
+def strip_accents(text, encoding):
+    """
+    Strip accents.
+
+    >>> print strip_accents("déjà", "utf-8")
+    deja
+    """
+    unicode_text= unicodedata.normalize('NFD', text.decode(encoding))
+    return filter(not_combining, unicode_text).encode(encoding)
+
+def normalize_filename(name):
+    """
+    Normalize a file name.
+    """
+    file_name = re.sub("[,'!?|&]", "", name)
+    file_name = re.sub("[\s.]", "_", file_name)
+    file_name = file_name.strip('_')
+    file_name = file_name.strip('.')
+    file_name = strip_accents(file_name, "utf-8")
+    return os.path.normpath(file_name)
+
 def top_words(dic_articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
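A quick sanity check of the two helpers added above (a sketch, assuming a UTF-8 terminal so the literals reach the functions as UTF-8 byte strings; the input strings are arbitrary examples):

>>> import utils
>>> utils.strip_accents("déjà vu, été", "utf-8")
'deja vu, ete'
>>> utils.normalize_filename("L'actualité du jour !")
'Lactualite_du_jour'

Note that normalize_filename() does not remove path separators; export() only ever passes it a single path component (the feed name), not a full path.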