aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: cedricbonhomme <devnull@localhost> 2010-02-28 18:32:09 +0100
committer: cedricbonhomme <devnull@localhost> 2010-02-28 18:32:09 +0100
commit: 968e85dfcb22c3073c568e2812c7594df783b7be (patch)
tree: 0540f5441b99872e773552c0fdfa76d68c0def2a
parent: pyAggr3g470r can now detect the language of an article thanks to the module o... (diff)
download: newspipe-968e85dfcb22c3073c568e2812c7594df783b7be.tar.gz
newspipe-968e85dfcb22c3073c568e2812c7594df783b7be.tar.bz2
newspipe-968e85dfcb22c3073c568e2812c7594df783b7be.zip
Enhancement of the language detection.
-rw-r--r-- utils.py 7
1 file changed, 5 insertions, 2 deletions
diff --git a/utils.py b/utils.py
index b8f65089..e3e209b2 100644
--- a/utils.py
+++ b/utils.py
@@ -24,6 +24,8 @@ from oice.langdet import languages
def detect_language(text):
"""
+ Detect the language of a text.
+ English, French or other (not detected).
"""
try:
text_stream = streams.Stream(StringIO(text))
@@ -35,7 +37,7 @@ def detect_language(text):
elif lang == languages.english:
return 'english'.encode('utf-8')
else:
- lang == 'other'
+ return 'other'
def remove_html_tags(data):
"""
@@ -159,7 +161,8 @@ def load_feed():
article_id = sha256_hash.hexdigest()
if article[3] != "":
- language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8'))
+ language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8') + \
+ remove_html_tags(article[1]).encode('utf-8'))
else:
language = "other"
bgstack15