aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorcedricbonhomme <devnull@localhost>2010-02-28 18:06:10 +0100
committercedricbonhomme <devnull@localhost>2010-02-28 18:06:10 +0100
commit625c7a044c3c944683762b7bb75dec9dd5b53fb9 (patch)
treea8f448682cb05631b3f3e8934220a5be61351814
parentMinor enhancements of the code. (diff)
downloadnewspipe-625c7a044c3c944683762b7bb75dec9dd5b53fb9.tar.gz
newspipe-625c7a044c3c944683762b7bb75dec9dd5b53fb9.tar.bz2
newspipe-625c7a044c3c944683762b7bb75dec9dd5b53fb9.zip
pyAggr3g470r can now detect the language of an article thanks to the module oice.langdet.
-rw-r--r--pyAggr3g470r.py55
-rw-r--r--utils.py30
2 files changed, 79 insertions, 6 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index c197033f..e1251664 100644
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -131,6 +131,7 @@ class Root:
def management(self):
"""
+ Management of articles.
"""
self.articles, self.feeds = utils.load_feed()
html = htmlheader
@@ -174,7 +175,27 @@ class Root:
html += "</ol>\n</td><td>"
utils.create_histogram(top_words)
html += """<img src="/var/histogram.png" /></td></tr></table>"""
- html += "<hr />\n"
+
+ nb_french = 0
+ nb_english = 0
+ nb_other = 0
+ for rss_feed_id in self.articles.keys():
+ for article in self.articles[rss_feed_id]:
+ if article[6] == 'french':
+ nb_french += 1
+ elif article[6] == 'english':
+ nb_english += 1
+ else:
+ nb_other +=1
+
+ html += "<h1>Languages</h1>\n"
+ html += "<ul>\n"
+ for language in ['english', 'french', 'other']:
+ html += """<li>%s articles in <a href="/language/%s">%s</a></li>\n""" % \
+ (locals()["nb_"+language],
+ language, language)
+ html += "</ul>\n"
+ html += "<hr />\n"
html += htmlfooter
return html
@@ -274,7 +295,9 @@ class Root:
html += description
else:
html += "No description available."
- html += """<hr />\n<a href="%s">Complete story</a>\n<br />\n""" % \
+ html += "<hr />\n"
+ html += """This article is written in %s.""" % (article[6],)
+ html += """<br /><a href="%s">Complete story</a>\n<br />\n""" % \
(article[3].encode('utf-8'),)
# Share this article:
# on delicious
@@ -310,7 +333,7 @@ class Root:
def all_articles(self, feed_id):
"""
- Display all articles of a feed ('feed_title').
+ Display all articles of a feed.
"""
html = htmlheader
html += htmlnav
@@ -357,7 +380,7 @@ class Root:
def unread(self, feed_id):
"""
- Display all unread articles of a feed ('feed_title').
+ Display all unread articles of a feed.
"""
html = htmlheader
html += htmlnav
@@ -393,6 +416,30 @@ class Root:
unread.exposed = True
+ def language(self, lang):
+ """Display the articles whose stored language (article[6]) equals 'lang'.
+ """
+ html = htmlheader
+ html += htmlnav
+ html += """</div> <div class="left inner">"""
+
+ html += """<h1>Article(s) written in %s</h1>""" % (lang,)
+
+ for rss_feed_id in self.articles.keys():
+ for article in self.articles[rss_feed_id]:
+ if article[6] == lang:
+ html += article[1].encode('utf-8') + \
+ """ - <a href="/description/%s" rel="noreferrer" target="_blank">%s</a>
+ from <i><a href="%s">%s</a></i><br />\n""" % \
+ (article[0].encode('utf-8'), article[2].encode('utf-8'), \
+ self.feeds[rss_feed_id][5].encode('utf-8'), \
+ self.feeds[rss_feed_id][3].encode('utf-8'))
+
+ html += "<hr />\n"
+ html += htmlfooter
+ return html
+
+ language.exposed = True
def mark_as_read(self, target):
"""
diff --git a/utils.py b/utils.py
index b7434636..b8f65089 100644
--- a/utils.py
+++ b/utils.py
@@ -16,6 +16,26 @@ from datetime import datetime
from string import punctuation
from collections import defaultdict
+from StringIO import StringIO
+
+from oice.langdet import langdet
+from oice.langdet import streams
+from oice.langdet import languages
+
+def detect_language(text):
+    """
+    Return 'french', 'english' or 'other' for 'text'; detector errors give 'other'.
+    """
+    try:
+        text_stream = streams.Stream(StringIO(text))
+        lang = langdet.LanguageDetector.detect(text_stream)
+    except Exception:
+        return 'other'
+    if lang == languages.french:
+        return 'french'
+    if lang == languages.english:
+        return 'english'
+    return 'other'
def remove_html_tags(data):
"""
@@ -119,7 +139,8 @@ def load_feed():
pass
# articles[feed_id] = (article_id, article_date, article_title,
- # article_link, article_description, article_readed)
+ # article_link, article_description, article_readed,
+ # article_language)
# feeds[feed_id] = (nb_article, nb_article_unreaded, feed_image,
# feed_title, feed_link, feed_site_link)
articles, feeds = {}, {}
@@ -137,8 +158,13 @@ def load_feed():
sha256_hash.update(article[2].encode('utf-8'))
article_id = sha256_hash.hexdigest()
+ if article[3] != "":
+ language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8'))
+ else:
+ language = "other"
+
article_list = [article_id, article[0], article[1], \
- article[2], article[3], article[4]]
+ article[2], article[3], article[4], language]
if feed_id not in articles:
articles[feed_id] = [article_list]
bgstack15