From 968e85dfcb22c3073c568e2812c7594df783b7be Mon Sep 17 00:00:00 2001 From: cedricbonhomme Date: Sun, 28 Feb 2010 18:32:09 +0100 Subject: Enhancement of the language detection. --- utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'utils.py') diff --git a/utils.py b/utils.py index b8f65089..e3e209b2 100644 --- a/utils.py +++ b/utils.py @@ -24,6 +24,8 @@ from oice.langdet import languages def detect_language(text): """ + Detect the language of a text. + English, French or other (not detected). """ try: text_stream = streams.Stream(StringIO(text)) @@ -35,7 +37,7 @@ def detect_language(text): elif lang == languages.english: return 'english'.encode('utf-8') else: - lang == 'other' + return 'other' def remove_html_tags(data): """ @@ -159,7 +161,8 @@ def load_feed(): article_id = sha256_hash.hexdigest() if article[3] != "": - language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8')) + language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8') + \ + remove_html_tags(article[1]).encode('utf-8')) else: language = "other" -- cgit