diff options
author | cedricbonhomme <devnull@localhost> | 2010-04-28 08:24:41 +0200 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-04-28 08:24:41 +0200 |
commit | f3e68af10d4960df8ca59de3efb0ae2f53522f12 (patch) | |
tree | 65b2bcf21bb17be6d0291efb1c3494dd734d2059 | |
parent | Removed useless import. (diff) | |
download | newspipe-f3e68af10d4960df8ca59de3efb0ae2f53522f12.tar.gz newspipe-f3e68af10d4960df8ca59de3efb0ae2f53522f12.tar.bz2 newspipe-f3e68af10d4960df8ca59de3efb0ae2f53522f12.zip |
New regular expression to remove numeric HTML character references (for instance &#8217;, which renders as ’).
-rwxr-xr-x | utils.py | 3 |
1 file changed, 2 insertions, 1 deletion
@@ -74,7 +74,8 @@ def remove_html_tags(data):
     Remove HTML tags for the search.
     """
     p = re.compile(r'<[^<]*?/?>')
-    return p.sub('', data)
+    q = re.compile(r'&#[0-9]+;')
+    return p.sub('', q.sub('', data))
 
 def top_words(dic_articles, n=10, size=5):
     """