From 625c7a044c3c944683762b7bb75dec9dd5b53fb9 Mon Sep 17 00:00:00 2001 From: cedricbonhomme Date: Sun, 28 Feb 2010 18:06:10 +0100 Subject: pyAggr3g470r can now detect the language of an article thanks to the module oice.langdet. --- pyAggr3g470r.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++---- utils.py | 30 ++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py index c197033f..e1251664 100644 --- a/pyAggr3g470r.py +++ b/pyAggr3g470r.py @@ -131,6 +131,7 @@ class Root: def management(self): """ + Management of articles. """ self.articles, self.feeds = utils.load_feed() html = htmlheader @@ -174,7 +175,27 @@ class Root: html += "\n" utils.create_histogram(top_words) html += """""" - html += "
\n" + + nb_french = 0 + nb_english = 0 + nb_other = 0 + for rss_feed_id in self.articles.keys(): + for article in self.articles[rss_feed_id]: + if article[6] == 'french': + nb_french += 1 + elif article[6] == 'english': + nb_english += 1 + else: + nb_other +=1 + + html += "

Languages

\n" + html += "\n" + html += "
\n" html += htmlfooter return html @@ -274,7 +295,9 @@ class Root: html += description else: html += "No description available." - html += """
\nComplete story\n
\n""" % \ + html += "
\n" + html += """This article is written in %s.""" % (article[6],) + html += """
Complete story\n
\n""" % \ (article[3].encode('utf-8'),) # Share this article: # on delicious @@ -310,7 +333,7 @@ class Root: def all_articles(self, feed_id): """ - Display all articles of a feed ('feed_title'). + Display all articles of a feed. """ html = htmlheader html += htmlnav @@ -357,7 +380,7 @@ class Root: def unread(self, feed_id): """ - Display all unread articles of a feed ('feed_title'). + Display all unread articles of a feed. """ html = htmlheader html += htmlnav @@ -393,6 +416,30 @@ class Root: unread.exposed = True + def language(self, lang): + """ + """ + html = htmlheader + html += htmlnav + html += """
""" + + html += """

Article(s) written in %s

""" % (lang,) + + for rss_feed_id in self.articles.keys(): + for article in self.articles[rss_feed_id]: + if article[6] == lang: + html += article[1].encode('utf-8') + \ + """ - %s + from %s
\n""" % \ + (article[0].encode('utf-8'), article[2].encode('utf-8'), \ + self.feeds[rss_feed_id][5].encode('utf-8'), \ + self.feeds[rss_feed_id][3].encode('utf-8')) + + html += "
\n"
+        html += htmlfooter
+        return html
+
+    language.exposed = True
 
     def mark_as_read(self, target):
         """
diff --git a/utils.py b/utils.py
index b7434636..b8f65089 100644
--- a/utils.py
+++ b/utils.py
@@ -16,6 +16,26 @@
 from datetime import datetime
 from string import punctuation
 from collections import defaultdict
+from StringIO import StringIO
+
+from oice.langdet import langdet
+from oice.langdet import streams
+from oice.langdet import languages
+
+def detect_language(text):
+    """Return 'french', 'english' or 'other'; 'other' if detection fails.
+    """
+    try:
+        text_stream = streams.Stream(StringIO(text))
+        lang = langdet.LanguageDetector.detect(text_stream)
+    except Exception:
+        return 'other'
+    if lang == languages.french:
+        return 'french'.encode('utf-8')
+    elif lang == languages.english:
+        return 'english'.encode('utf-8')
+    else:
+        return 'other'
 
 def remove_html_tags(data):
     """
@@ -119,7 +139,8 @@ def load_feed():
         pass
 
     # articles[feed_id] = (article_id, article_date, article_title,
-    #               article_link, article_description, article_readed)
+    #               article_link, article_description, article_readed,
+    #               article_language)
     # feeds[feed_id] = (nb_article, nb_article_unreaded, feed_image,
     #               feed_title, feed_link, feed_site_link)
     articles, feeds = {}, {}
@@ -137,8 +158,13 @@ def load_feed():
             sha256_hash.update(article[2].encode('utf-8'))
             article_id = sha256_hash.hexdigest()
 
+            if article[3] != "":
+                language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8'))
+            else:
+                language = "other"
+
             article_list = [article_id, article[0], article[1], \
-                article[2], article[3], article[4]]
+                article[2], article[3], article[4], language]
 
             if feed_id not in articles:
                 articles[feed_id] = [article_list]
-- 
cgit