diff options
author | Cédric Bonhomme <kimble.mandel@gmail.com> | 2012-12-20 23:31:42 +0100 |
---|---|---|
committer | Cédric Bonhomme <kimble.mandel@gmail.com> | 2012-12-20 23:31:42 +0100 |
commit | 13547a54dc2515aeb414683c769913d95564403b (patch) | |
tree | 6edd0ce0a5237fa9ba118781ccedc10ff2d4972b /source/utils.py | |
parent | Typo. (diff) | |
download | newspipe-13547a54dc2515aeb414683c769913d95564403b.tar.gz newspipe-13547a54dc2515aeb414683c769913d95564403b.tar.bz2 newspipe-13547a54dc2515aeb414683c769913d95564403b.zip |
Added getwords(html) function.
Diffstat (limited to 'source/utils.py')
-rwxr-xr-x | source/utils.py | 10 |
1 file changed, 10 insertions, 0 deletions
```diff
diff --git a/source/utils.py b/source/utils.py
index d9b21169..14c0096f 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,6 +83,16 @@ def detect_url_errors(list_of_urls):
             errors.append((url, e.reason.errno ,e.reason.strerror))
     return errors
 
+def getwords(html):
+    # Remove all the HTML tags
+    txt=re.compile(r'<[^>]+>').sub('',html)
+
+    # Split words by all non-alpha characters
+    words=re.compile(r'[^A-Z^a-z]+').split(txt)
+
+    # Convert to lowercase
+    return [word.lower() for word in words if word!='']
+
 def clear_string(data):
     """
     Clear a string by removing HTML tags, HTML special caracters
```