-rw-r--r--  fetch.py                     4
-rw-r--r--  pyaggr3g470r/crawler.py     25
-rw-r--r--  pyaggr3g470r/feedgetter.py 215
-rw-r--r--  pyaggr3g470r/views.py        2
4 files changed, 28 insertions(+), 218 deletions(-)
diff --git a/fetch.py b/fetch.py
index 1bf10239..e65fc1b1 100644
--- a/fetch.py
+++ b/fetch.py
@@ -8,7 +8,7 @@
# to fetch articles every 30 minutes.
import sys
-from pyaggr3g470r import feedgetter
+from pyaggr3g470r import crawler
if __name__ == "__main__":
# Point of entry in execution mode
@@ -16,5 +16,5 @@ if __name__ == "__main__":
feed_id = int(sys.argv[2])
except:
feed_id = None
- feed_getter = feedgetter.FeedGetter(sys.argv[1])
+ feed_getter = crawler.FeedGetter(sys.argv[1])
feed_getter.retrieve_feed(feed_id)
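The entry point keeps its two-argument interface after the rename: sys.argv[1] is the email of the account whose feeds should be refreshed, and the optional sys.argv[2] is a feed id. A usage sketch (the email address below is a placeholder, not something taken from the diff):

# Hypothetical invocations of the cron entry point:
#   python fetch.py user@example.com        # refresh all enabled feeds
#   python fetch.py user@example.com 42     # refresh only the feed with id 42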
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 15849b9d..0393ae31 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -1,6 +1,31 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
+# pyAggr3g470r - A Web based news aggregator.
+# Copyright (C) 2010-2014 Cédric Bonhomme - http://cedricbonhomme.org/
+#
+# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+__author__ = "Cedric Bonhomme"
+__version__ = "$Revision: 2.1 $"
+__date__ = "$Date: 2010/09/02 $"
+__revision__ = "$Date: 2014/04/13 $"
+__copyright__ = "Copyright (c) Cedric Bonhomme"
+__license__ = "AGPLv3"
+
import feedparser
import urllib2
import requests
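The hunk above only adds the license header and module metadata, but the imports around it (feedparser, urllib2, requests) are the ones the deleted feedgetter.py below relied on. A minimal sketch of that fetching strategy, assuming crawler.FeedGetter keeps the old proxy handling; HTTP_PROXY and the feed URL are placeholders standing in for conf.HTTP_PROXY and a real feed:

import urllib2

import feedparser

# Parse a feed through an optional HTTP proxy, as the old FeedGetter did.
# HTTP_PROXY is a "host:port" string; an empty value disables the proxy.
HTTP_PROXY = ""
handler = urllib2.ProxyHandler(
    {"http": HTTP_PROXY, "https": HTTP_PROXY} if HTTP_PROXY else {})
parsed = feedparser.parse("http://example.com/feed.xml", handlers=[handler])
for entry in parsed.entries:
    print entry.link  # Python 2, like the rest of the codebase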
diff --git a/pyaggr3g470r/feedgetter.py b/pyaggr3g470r/feedgetter.py
deleted file mode 100644
index b06c8eff..00000000
--- a/pyaggr3g470r/feedgetter.py
+++ /dev/null
@@ -1,215 +0,0 @@
-#! /usr/bin/env python
-#-*- coding: utf-8 -*-
-
-# pyAggr3g470r - A Web based news aggregator.
-# Copyright (C) 2010-2014 Cédric Bonhomme - http://cedricbonhomme.org/
-#
-# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-__author__ = "Cedric Bonhomme"
-__version__ = "$Revision: 2.0 $"
-__date__ = "$Date: 2010/09/02 $"
-__revision__ = "$Date: 2013/11/10 $"
-__copyright__ = "Copyright (c) Cedric Bonhomme"
-__license__ = "AGPLv3"
-
-import urllib2
-import requests
-import threading
-import feedparser
-from datetime import datetime
-from BeautifulSoup import BeautifulSoup
-
-from requests.exceptions import Timeout
-
-from sqlalchemy.exc import IntegrityError
-
-import models
-import conf
-if not conf.ON_HEROKU:
- import search
-import utils
-
-if not conf.ON_HEROKU:
- from flask.ext.mail import Message
- from pyaggr3g470r import mail
-
-from pyaggr3g470r import app, db
-from pyaggr3g470r.models import User, Feed, Article
-
-import log
-pyaggr3g470r_log = log.Log("feedgetter")
-
-list_of_threads = []
-
-class FeedGetter(object):
- """
-    This class is in charge of retrieving the feeds of a user.
-    It uses the feedparser module by Mark Pilgrim.
- For each feed a new thread is launched.
- """
- def __init__(self, email):
- """
-        Initializes the proxy configuration and loads the user from the database.
- """
- feedparser.USER_AGENT = conf.USER_AGENT
- if conf.HTTP_PROXY == "":
- self.proxy = urllib2.ProxyHandler({})
- self.proxies = {}
- else:
- self.proxy = urllib2.ProxyHandler({"http" : conf.HTTP_PROXY, \
- "https": conf.HTTP_PROXY})
- self.proxies = {
- "http": "http://" + conf.HTTP_PROXY,
- "https": "http://" + conf.HTTP_PROXY
- }
- feedparser.USER_AGENT = conf.USER_AGENT
- self.user = User.query.filter(User.email == email).first()
-
- def retrieve_feed(self, feed_id=None):
- """
-        Launches a thread for each enabled RSS feed of the user.
- """
- feeds = [feed for feed in self.user.feeds if feed.enabled]
- if feed_id != None:
- feeds = [feed for feed in feeds if feed.id == feed_id]
- for current_feed in feeds:
- try:
- # launch a new thread for the RSS feed
- thread = threading.Thread(None, self.process, \
- None, (current_feed, ))
- thread.start()
- list_of_threads.append(thread)
- except:
- pass
-
-        # wait until all threads are done
- for th in list_of_threads:
- th.join()
-
- def process(self, feed):
- """
-        Retrieves articles from the feed and adds them to the database.
- """
- a_feed = feedparser.parse(feed.link, handlers = [self.proxy])
- if a_feed['entries'] == []:
- return
-
-        # Feed information
- if feed.title == "":
- try:
- feed.title = a_feed.feed.title
- except:
- feed.title = ""
- if feed.description == "":
- try:
- feed.description = a_feed.feed.subtitle
- except:
- feed.description = ""
-
- articles = []
- for article in a_feed['entries']:
-
- nice_url = article.link.encode("utf-8")
- if conf.RESOLVE_ARTICLE_URL:
- try:
- # resolves URL behind proxies (like feedproxy.google.com)
- r = requests.get(article.link, timeout=5.0, proxies=self.proxies)
- nice_url = r.url.encode("utf-8")
- except Timeout:
- pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,))
- continue
- except Exception as e:
- pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e)))
- continue
- # remove utm_* parameters
- nice_url = utils.clean_url(nice_url)
-
- exist1 = Article.query.filter(Article.user_id == self.user.id, Article.link == nice_url).first()
- exist2 = Article.query.filter(Article.user_id == self.user.id, Article.link == utils.clean_url(article.link.encode("utf-8"))).first()
- if exist1 != None or exist2 != None:
- continue
-
- description = ""
- article_title = ""
- try:
- # article content
- description = article.content[0].value
- except AttributeError:
- try:
- # article description
- description = article.description
- except Exception:
- description = ""
- try:
- description = BeautifulSoup(description, "html.parser").decode()
- article_title = BeautifulSoup(article.title, "html.parser").decode()
- except Exception as E:
- #pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
- article_title = article.title
-
- try:
- post_date = datetime(*article.published_parsed[:6])
- except:
- post_date = datetime(*article.updated_parsed[:6])
-
- # save the article
- article = Article(link=nice_url, title=article_title, \
- content=description, readed=False, like=False, date=post_date, \
- user_id=self.user.id, feed_id=feed.id)
- articles.append(article)
-
- # add the article to the Whoosh index only if we are not on Heroku
- """
- if not conf.ON_HEROKU:
- try:
- search.add_to_index([article], feed)
- except Exception as e:
- pyaggr3g470r_log.error("Whoosh error.")
- pass"""
-
- # email notification
- if conf.MAIL_ENABLED and feed.email_notification:
- with app.app_context():
- msg = Message('[pyAggr3g470r] ' + feed.title + ' : ' + article.title, \
- sender = conf.MAIL_FROM, recipients = [conf.MAIL_TO])
- msg.body = utils.clear_string(description)
- msg.html = description
- mail.send(msg)
-
- # add the articles to the list of articles for the current feed
- for article in articles:
- try:
- feed.articles.append(article)
- db.session.merge(article)
- db.session.commit()
- pyaggr3g470r_log.info("New article %s (%s) added." % (article_title, nice_url))
- except IntegrityError:
- pyaggr3g470r_log.error("Article %s (%s) already in the database." % (article_title, nice_url))
- db.session.rollback()
- continue
- except Exception as e:
- pyaggr3g470r_log.error("Error when inserting article in database: " + str(e))
- continue
- db.session.close()
- return True
-
-
-if __name__ == "__main__":
- # Point of entry in execution mode
- feed_getter = FeedGetter()
- # Retrieve all feeds
- feed_getter.retrieve_feed()
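The deleted module leans on a utils.clean_url helper that is not part of this commit; the inline comment above only says it removes utm_* parameters. A hypothetical reconstruction under that assumption, in the codebase's Python 2:

import urllib
import urlparse

def clean_url(url):
    # Hypothetical sketch of utils.clean_url: drop utm_* tracking parameters
    # from the query string and leave the rest of the URL untouched.
    parts = urlparse.urlsplit(url)
    query = [(key, value) for key, value in urlparse.parse_qsl(parts.query)
             if not key.startswith("utm_")]
    return urlparse.urlunsplit((parts.scheme, parts.netloc, parts.path,
                                urllib.urlencode(query), parts.fragment))

# clean_url("http://example.com/a?utm_source=rss&id=1") -> "http://example.com/a?id=1"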
diff --git a/pyaggr3g470r/views.py b/pyaggr3g470r/views.py
index c6828b25..e88fe938 100644
--- a/pyaggr3g470r/views.py
+++ b/pyaggr3g470r/views.py
@@ -38,7 +38,7 @@ from werkzeug import generate_password_hash
import conf
import utils
import export
-import feedgetter, crawler
+import crawler
import models
if not conf.ON_HEROKU:
import search as fastsearch