def getwords(html):
    """
    Return the list of lowercase words found in an HTML string.

    HTML tags are removed first, then the remaining text is cut into
    maximal runs of ASCII letters (A-Z, a-z). Every other character
    (digits, punctuation, whitespace, '^', non-ASCII letters...) acts
    as a separator. An empty list is returned when no word is found.
    """
    # Remove all the HTML tags. Tags are replaced by the empty string, so
    # text on both sides of an inline tag stays glued together.
    txt = re.sub(r'<[^>]+>', '', html)

    # Collect runs of letters. The previous pattern, '[^A-Z^a-z]+', carried
    # a stray '^' inside the negated class, which made the caret count as a
    # word character ("a^b" was kept as a single word).
    words = re.findall(r'[A-Za-z]+', txt)

    # Convert to lowercase
    return [word.lower() for word in words]