diff options
author | cedricbonhomme <devnull@localhost> | 2010-02-28 18:06:10 +0100 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-02-28 18:06:10 +0100 |
commit | 625c7a044c3c944683762b7bb75dec9dd5b53fb9 (patch) | |
tree | a8f448682cb05631b3f3e8934220a5be61351814 | |
parent | Minor enhancements of the code. (diff) | |
download | newspipe-625c7a044c3c944683762b7bb75dec9dd5b53fb9.tar.gz newspipe-625c7a044c3c944683762b7bb75dec9dd5b53fb9.tar.bz2 newspipe-625c7a044c3c944683762b7bb75dec9dd5b53fb9.zip |
pyAggr3g470r can now detect the language of an article thanks to the module oice.langdet.
-rw-r--r-- | pyAggr3g470r.py | 55 | ||||
-rw-r--r-- | utils.py | 30 |
2 files changed, 79 insertions, 6 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py index c197033f..e1251664 100644 --- a/pyAggr3g470r.py +++ b/pyAggr3g470r.py @@ -131,6 +131,7 @@ class Root: def management(self): """ + Management of articles. """ self.articles, self.feeds = utils.load_feed() html = htmlheader @@ -174,7 +175,27 @@ class Root: html += "</ol>\n</td><td>" utils.create_histogram(top_words) html += """<img src="/var/histogram.png" /></td></tr></table>""" - html += "<hr />\n" + + nb_french = 0 + nb_english = 0 + nb_other = 0 + for rss_feed_id in self.articles.keys(): + for article in self.articles[rss_feed_id]: + if article[6] == 'french': + nb_french += 1 + elif article[6] == 'english': + nb_english += 1 + else: + nb_other +=1 + + html += "<h1>Languages</h1>\n" + html += "<ul>\n" + for language in ['english', 'french', 'other']: + html += """<li>%s articles in <a href="/language/%s">%s</a></li>\n""" % \ + (locals()["nb_"+language], + language, language) + html += "</ul>\n" + html += "<hr />\n" html += htmlfooter return html @@ -274,7 +295,9 @@ class Root: html += description else: html += "No description available." - html += """<hr />\n<a href="%s">Complete story</a>\n<br />\n""" % \ + html += "<hr />\n" + html += """This article is written in %s.""" % (article[6],) + html += """<br /><a href="%s">Complete story</a>\n<br />\n""" % \ (article[3].encode('utf-8'),) # Share this article: # on delicious @@ -310,7 +333,7 @@ class Root: def all_articles(self, feed_id): """ - Display all articles of a feed ('feed_title'). + Display all articles of a feed. """ html = htmlheader html += htmlnav @@ -357,7 +380,7 @@ class Root: def unread(self, feed_id): """ - Display all unread articles of a feed ('feed_title'). + Display all unread articles of a feed. """ html = htmlheader html += htmlnav @@ -393,6 +416,30 @@ class Root: unread.exposed = True + def language(self, lang): + """ + """ + html = htmlheader + html += htmlnav + html += """</div> <div class="left inner">""" + + html += """<h1>Article(s) written in %s</h1>""" % (lang,) + + for rss_feed_id in self.articles.keys(): + for article in self.articles[rss_feed_id]: + if article[6] == lang: + html += article[1].encode('utf-8') + \ + """ - <a href="/description/%s" rel="noreferrer" target="_blank">%s</a> + from <i><a href="%s">%s</a></i><br />\n""" % \ + (article[0].encode('utf-8'), article[2].encode('utf-8'), \ + self.feeds[rss_feed_id][5].encode('utf-8'), \ + self.feeds[rss_feed_id][3].encode('utf-8')) + + html += "<hr />\n" + html += htmlfooter + return html + + language.exposed = True def mark_as_read(self, target): """ @@ -16,6 +16,26 @@ from datetime import datetime from string import punctuation from collections import defaultdict +from StringIO import StringIO + +from oice.langdet import langdet +from oice.langdet import streams +from oice.langdet import languages + +def detect_language(text): + """ + """ + try: + text_stream = streams.Stream(StringIO(text)) + lang = langdet.LanguageDetector.detect(text_stream) + except: + return 'other' + if lang == languages.french: + return 'french'.encode('utf-8') + elif lang == languages.english: + return 'english'.encode('utf-8') + else: + lang == 'other' def remove_html_tags(data): """ @@ -119,7 +139,8 @@ def load_feed(): pass # articles[feed_id] = (article_id, article_date, article_title, - # article_link, article_description, article_readed) + # article_link, article_description, article_readed, + # article_language) # feeds[feed_id] = (nb_article, nb_article_unreaded, feed_image, # feed_title, feed_link, feed_site_link) articles, feeds = {}, {} @@ -137,8 +158,13 @@ def load_feed(): sha256_hash.update(article[2].encode('utf-8')) article_id = sha256_hash.hexdigest() + if article[3] != "": + language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8')) + else: + language = "other" + article_list = [article_id, article[0], article[1], \ - article[2], article[3], article[4]] + article[2], article[3], article[4], language] if feed_id not in articles: articles[feed_id] = [article_list] |