aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author cedricbonhomme <devnull@localhost> 2010-04-28 08:24:41 +0200
committer cedricbonhomme <devnull@localhost> 2010-04-28 08:24:41 +0200
commit f3e68af10d4960df8ca59de3efb0ae2f53522f12 (patch)
tree 65b2bcf21bb17be6d0291efb1c3494dd734d2059
parent Removed useless import. (diff)
download newspipe-f3e68af10d4960df8ca59de3efb0ae2f53522f12.tar.gz
newspipe-f3e68af10d4960df8ca59de3efb0ae2f53522f12.tar.bz2
newspipe-f3e68af10d4960df8ca59de3efb0ae2f53522f12.zip
New regular expression to remove special characters (for instance &#8217;).
-rwxr-xr-x utils.py 3
1 file changed, 2 insertions, 1 deletion
diff --git a/utils.py b/utils.py
index 3b8b376a..fc945f34 100755
--- a/utils.py
+++ b/utils.py
@@ -74,7 +74,8 @@ def remove_html_tags(data):
Remove HTML tags for the search.
"""
p = re.compile(r'<[^<]*?/?>')
- return p.sub('', data)
+ q = re.compile(r'&#[0-9]+;')
+ return p.sub('', q.sub('', data))
def top_words(dic_articles, n=10, size=5):
"""
bgstack15