From 901fbd154f16268ca4c9d10af8d038d684c8c4f4 Mon Sep 17 00:00:00 2001 From: cedricbonhomme Date: Thu, 8 Nov 2012 23:08:35 +0100 Subject: Porting to Python 3.2. Better, faster, stronger. --- source/utils.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'source/utils.py') diff --git a/source/utils.py b/source/utils.py index 7d1eaecc..da26d022 100755 --- a/source/utils.py +++ b/source/utils.py @@ -37,18 +37,18 @@ __license__ = "GPLv3" import os import re import operator -import urlparse +import urllib.parse import calendar import unicodedata -import htmlentitydefs +import html.entities import smtplib from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText -import urllib2 -import BaseHTTPServer -from BeautifulSoup import BeautifulSoup +import urllib.request, urllib.error, urllib.parse +import http.server +from bs4 import BeautifulSoup from datetime import datetime from collections import Counter @@ -70,14 +70,14 @@ def detect_url_errors(list_of_urls): """ errors = [] for url in list_of_urls: - req = urllib2.Request(url) + req = urllib.request.Request(url) try: - urllib2.urlopen(req) - except urllib2.HTTPError, e: + urllib.request.urlopen(req) + except urllib.error.HTTPError as e: # server couldn't fulfill the request errors.append((url, e.code, \ - BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code][1])) - except urllib2.URLError, e: + http.server.BaseHTTPRequestHandler.responses[e.code][1])) + except urllib.error.URLError as e: # failed to reach the server errors.append((url, e.reason.errno ,e.reason.strerror)) return errors @@ -87,9 +87,9 @@ def clear_string(data): Clear a string by removing HTML tags, HTML special caracters and consecutive white spaces (more that one). """ - p = re.compile(r'<[^<]*?/?>') # HTML tags - q = re.compile(r'\s') # consecutive white spaces - return p.sub('', q.sub(' ', data)) + p = re.compile(b'<[^<]*?/?>') # HTML tags + q = re.compile(b'\s') # consecutive white spaces + return p.sub(b'', q.sub(b' ', bytes(data, "utf-8"))).decode("utf-8", "strict") def unescape(text): """ @@ -101,15 +101,15 @@ def unescape(text): # character reference try: if text[:3] == "&#x": - return unichr(int(text[3:-1], 16)) + return chr(int(text[3:-1], 16)) else: - return unichr(int(text[2:-1])) + return chr(int(text[2:-1])) except ValueError: pass else: # named entity try: - text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) + text = chr(html.entities.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is @@ -244,7 +244,7 @@ def change_feed_url(old_feed_url, new_feed_url): # Replace the URL in the text file with open("./var/feed.lst", "r") as f: lines = f.readlines() - lines = map(str.strip, lines) + lines = list(map(str.strip, lines)) try: lines[lines.index(old_feed_url)] = new_feed_url except: @@ -271,7 +271,7 @@ def search_feed(url): """ soup = None try: - page = urllib2.urlopen(url) + page = urllib.request.urlopen(url) soup = BeautifulSoup(page) except: return None @@ -279,6 +279,6 @@ def search_feed(url): feed_links.extend(soup('link', type='application/rss+xml')) for feed_link in feed_links: if url not in feed_link['href']: - return urlparse.urljoin(url, feed_link['href']) + return urllib.parse.urljoin(url, feed_link['href']) return feed_link['href'] return None -- cgit