diff options
author | cedricbonhomme <devnull@localhost> | 2010-02-28 18:06:10 +0100 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-02-28 18:06:10 +0100 |
commit | 625c7a044c3c944683762b7bb75dec9dd5b53fb9 (patch) | |
tree | a8f448682cb05631b3f3e8934220a5be61351814 /utils.py | |
parent | Minor enhancements of the code. (diff) | |
download | newspipe-625c7a044c3c944683762b7bb75dec9dd5b53fb9.tar.gz newspipe-625c7a044c3c944683762b7bb75dec9dd5b53fb9.tar.bz2 newspipe-625c7a044c3c944683762b7bb75dec9dd5b53fb9.zip |
pyAggr3g470r can now detect the language of an article thanks to the module oice.langdet.
Diffstat (limited to 'utils.py')
-rw-r--r-- | utils.py | 30 |
1 files changed, 28 insertions, 2 deletions
```diff
@@ -16,6 +16,26 @@ from datetime import datetime
 from string import punctuation
 from collections import defaultdict
 
+from StringIO import StringIO
+
+from oice.langdet import langdet
+from oice.langdet import streams
+from oice.langdet import languages
+
+def detect_language(text):
+    """
+    """
+    try:
+        text_stream = streams.Stream(StringIO(text))
+        lang = langdet.LanguageDetector.detect(text_stream)
+    except:
+        return 'other'
+    if lang == languages.french:
+        return 'french'.encode('utf-8')
+    elif lang == languages.english:
+        return 'english'.encode('utf-8')
+    else:
+        lang == 'other'
 
 def remove_html_tags(data):
     """
@@ -119,7 +139,8 @@ def load_feed():
         pass
 
     # articles[feed_id] = (article_id, article_date, article_title,
-    #               article_link, article_description, article_readed)
+    #               article_link, article_description, article_readed,
+    #               article_language)
     # feeds[feed_id] = (nb_article, nb_article_unreaded, feed_image,
     #               feed_title, feed_link, feed_site_link)
     articles, feeds = {}, {}
@@ -137,8 +158,13 @@ def load_feed():
                 sha256_hash.update(article[2].encode('utf-8'))
                 article_id = sha256_hash.hexdigest()
 
+                if article[3] != "":
+                    language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8'))
+                else:
+                    language = "other"
+
                 article_list = [article_id, article[0], article[1], \
-                                    article[2], article[3], article[4]]
+                                    article[2], article[3], article[4], language]
 
                 if feed_id not in articles:
                     articles[feed_id] = [article_list]
```