diff options
author | cedricbonhomme <devnull@localhost> | 2010-02-28 18:32:09 +0100 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-02-28 18:32:09 +0100 |
commit | 968e85dfcb22c3073c568e2812c7594df783b7be (patch) | |
tree | 0540f5441b99872e773552c0fdfa76d68c0def2a | |
parent | pyAggr3g470r can now detect the language of an article thanks to the module o... (diff) | |
download | newspipe-968e85dfcb22c3073c568e2812c7594df783b7be.tar.gz newspipe-968e85dfcb22c3073c568e2812c7594df783b7be.tar.bz2 newspipe-968e85dfcb22c3073c568e2812c7594df783b7be.zip |
Enhancement of the language detection.
-rw-r--r-- | utils.py | 7 |
1 file changed, 5 insertions, 2 deletions
```diff
@@ -24,6 +24,8 @@ from oice.langdet import languages
 
 def detect_language(text):
     """
+    Detect the language of a text.
+    English, French or other (not detected).
     """
     try:
         text_stream = streams.Stream(StringIO(text))
@@ -35,7 +37,7 @@ def detect_language(text):
         elif lang == languages.english:
             return 'english'.encode('utf-8')
         else:
-            lang == 'other'
+            return 'other'
 
 def remove_html_tags(data):
     """
@@ -159,7 +161,8 @@ def load_feed():
                 article_id = sha256_hash.hexdigest()
 
                 if article[3] != "":
-                    language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8'))
+                    language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8') + \
+                                            remove_html_tags(article[1]).encode('utf-8'))
                 else:
                     language = "other"
 
```