From 4a8438d7f2b7b16941240b91f39a9402c431ffc2 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Tue, 2 Feb 2016 23:30:57 +0100 Subject: writing a bit of doc, moving crawler together --- AUTHORS.rst | 6 +- CHANGELOG.md | 512 ---------------------------------------- CHANGELOG.rst | 518 +++++++++++++++++++++++++++++++++++++++++ README.rst | 31 ++- src/crawler.py | 168 ------------- src/crawler/classic_crawler.py | 168 +++++++++++++ src/crawler/http_crawler.py | 251 ++++++++++++++++++++ src/manager.py | 6 +- src/web/lib/crawler.py | 251 -------------------- src/web/utils.py | 4 +- 10 files changed, 959 insertions(+), 956 deletions(-) delete mode 100644 CHANGELOG.md create mode 100644 CHANGELOG.rst delete mode 100644 src/crawler.py create mode 100644 src/crawler/classic_crawler.py create mode 100644 src/crawler/http_crawler.py delete mode 100644 src/web/lib/crawler.py diff --git a/AUTHORS.rst b/AUTHORS.rst index 5a6f2cc0..dfac5267 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -1,6 +1,8 @@ -pyAggr3g470r is a free software written and maintained +Original author +--------------- +JARR is a free software written and maintained by Cédric Bonhomme https://www.cedricbonhomme.org Contributors -```````````` +------------ - François Schmidts http://1pxsolidblack.pl/ diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 1f3edea2..00000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,512 +0,0 @@ -================= -Release History -================= - -6.7 (2015-07-21) ----------------- - New: - * a filter mechanism for feeds has been added (PR #14); - * icon of feeds is now an url retrieved from the feed or the site link (PR #15). - Improvements: - * improvements for the bookmarklet (PR #16 and PR #18); - * performance improvements (display of the /feed page); - * enhancements for the two crawlers; - * enhancements of the UI (PR #14); - * misc changes to the models and controllers. 
- -6.6 (2015-06-02) ----------------- - New: - * it is now possible to sort the list of articles by feed title or - article title; - * added base unittests. - Improvements: - * fixed some minor bugs; - * improved the asyncio crawler; - * automatically use the good Python executable for the asyncio crawler; - * improved controllers (enforcing better use of user_id); - * the search is now case insensitive. - -6.5.5 (2015-04-22) ------------------- - The full text search powered by Whoosh has been removed. - -6.5.4 (2015-04-16) ------------------- - This release introduces a new config module and a new search functionality. - The result of a search is integrated in the main page. - -6.5.3 (2015-04-14) ------------------- - The fetch call is now dependent to the selected crawling method. - -6.5.2 (2015-04-14) ------------------- - The look and feel has been globally improved. - It is now possible to add a new feed from any page via a dropdown menu. - -6.5.1 (2015-04-08) ------------------- - Improvements: - * improvements on the controllers; - * the minimum error count is now specified in the configuration file. - -6.5 (2015-04-07) ----------------- - Improvements: - * new CSS; - * improved installation script; - * it is now possible to delete all duplicate articles with one HTTP - delete request. - -6.4 (2015-03-17) ----------------- - New: - * a new page 'history' to explore your database of articles. - Changes: - * updated documentation; - * minor improvements. - Fixes: - * changed the type of the column 'last_modified' to string. - -6.3 (2015-03-08) ----------------- - New: - * a new architecture with base for controllers; - * new, more complete RESTful API; - * a crawler handling errors and cache control; - * the new crawler takes advantage of the RESTful API - (can be run on the client side). - -6.2 (2015-02-26) ----------------- - The system of email notifications for new articles has been removed. - This feature was hardly used. 
- -6.1 (2015-02-23) ----------------- - Improvements: articles are now identified with the id provided - by the RSS/ATOM feed. - Prevent BeautifulSoup4 from adding extra '' tags to - the soup with the 'lxml' parser. - Indexation is now used with the new crawler. - The documentation has been updated. - -6.0 (2015-02-08) ----------------- - New: pyAggr3g470r is now working with Python 3.4. Tested on Heroku - with Python 3.4.2. - Improvements: The feed crawler uses the PEP 3156 (asyncio). The action - buttons are now on the left. It is easier to mark an article as read. - -5.7.0 (2014-11-20) ------------------- - Improvements: major improvements of the email notifications system. - Notifications are now sent through Postmark (for example for Heroku) - or a standard SMTP server. - Bug fix: activation key was too long for the database column. - -5.6.6 (2014-09-24) ------------------- - Improvements: Significant performance improvement for the views - /favorites and /unread. The performance problem has been introduced - with the release 5.6.5. - -5.6.5 (2014-09-15) ------------------- - This release fixes a major bug introduced with the version 0.9.7 of SQLAlchemy - (conflicts between persistent ant transcient SQLAlchemy objects). - -5.6.4 (2014-09-12) ------------------- - Translations have been updated. - Gravatar extension has been removed. - Minor fix. - -5.6.3 (2014-08-27) ------------------- - News: It is now possible to delete articles older than a given number - of weeks. This can be done via the management page. - A new environment variable enables to activate or deactivate the - self-registration. - Changes: translations were updated. Some minor bugfixes. Improved - deployment instructions in the README. - -5.6.2 (2014-08-10) ------------------- - Minor improvements: Articles are automatically retrieved after the import - of an OPML file. - When displaying all articles (unread + read), titles of unread articles - are emphasized in bold. 
- A new tab is opened when clicking on the title of an article. - New: pyAggr3g470r can be deployed with the Heroku auto deploy button. - -5.6.1 (2014-07-13) ------------------- - Performance improvements: faster database insertion of articles by - the crawler and loading of the management page. - Minor bug fixes. - -5.6 (2014-07-05) ----------------- - pyAggr3g470r has now a RESTful JSON API which enables to manage Feed and - Article objects. The API is documented in the README file. - The main page is using a subset of this API with jQuery HTTP requests. - -5.5 (2014-06-14) ----------------- - This release introduces a redesigned homepage which loads much faster and - is easier to read. pyAggr3g470r can now be run by Apache. - Adding a feed no longer requires a title and a site link. - -5.4 (2014-05-28) ----------------- - This version makes it possible for a user to export all of their feeds and - articles as a JSON file for later restoration. - -5.3 (2014-05-23) ----------------- - This release introduces some UI improvements, especially for the home page. - -5.2 (2014-05-16) ----------------- - This release adds minor bug fixes and UI improvements. - -5.1 (2014-05-13) ----------------- - When deployed on Heroku, the platform now uses the Postmark service to - send account confirmation emails to users. It is no longer required to - enter a first name and a last name to create an account. - -5.0 (2014-05-04) ----------------- - pyAggr3g470r is now translated into English and French. Improvements - concerning the news retriever and the Whoosh search functionality have - been made. The user can now export all articles in JSON format. - The user of the platform now has the possibility to delete his or her - account. - -4.9 (2014-04-24) ----------------- - This version introduces minor improvements to the Jinja templates and - bugfixes (relating to the import of OPML files with sub-categories and - relating to the Whoosh index generation). 
- -4.8 (2014-04-13) ----------------- - Feeds are now retrieved in a separated process with the Gevent library. - This offers the best performance on Heroku. - -4.7 (2014-04-12) ----------------- - pyAggr3g470r can now be deployed on Heroku or on a traditional server. - Moreover, several users can use an instance of pyAggr3g470r. A platform is - managed by the administrator, a user with specific rights. - -4.6 (2014-02-09) ----------------- - This release introduces the import of OPML files of subscriptions. - Minor improvements have been made to the templates. - -4.5 (2014-01-29) ----------------- - This release introduces a one step installation process with a simple - script. Minor improvements to the feedgetter module have been introduced - (the feed description is now stored in the database). Miscellaneous - improvements to the Jinja templates. Finally, more configuration options - are now offered to the user. - -4.4 (2013-12-27) ----------------- - This version introduces some improvements for the feedgetter module - including automatic retrieval of the URL behind feedproxy.google.com, - and support for configuring the user agent and proxy. Minor improvements - were made to the MongoEngine models. Notifications are displayed with - Flask flash messages. - -4.3 (2013-12-03) ----------------- - With this release, the user is able to update her personal information. - It is now possible to enable/disable the checking of updates for a feed. - Some performance improvements and user interface optimizations have been - done. - -4.2 (2013-11-10) ----------------- - This is the first release of the new version of pyAggr3g470r. - The code has been rewritten with the Flask microframework and the - Bootstrap frontend framework. - -4.1 (2013-08-11) ----------------- - HTTP proxy support has been added for the fetching of feeds. This is - useful, for example, if you are using Privoxy/Tor. 
- -4.0 (2013-06-25) ----------------- - Searching of articles is now achieved with Whoosh, a fast full-text - indexing and searching library. - -3.9 (2013-04-14) ----------------- - The code has been tested and ported to Python 3.3.1. Some minor bugs have - been fixed, with a lot of improvements concerning the Mako templates, - MongoDB database management, and management of exceptions. - -3.8 (2013-01-12) ----------------- - This release introduces a reworked management page; it is now possible to - change the username and password via this page. - Some improvements concerning the HTML export of the database have been - made. Finally, indexed MongoDB full text searching provides a much faster - search. - -3.7 (2012-12-29) ----------------- - pyAggr3g470r is now using the Mako template library. - Performance improvements for the MongoDB database have been made, and some - minor bugfixes. Stop words (a, of, the, an, for...) are now ignored when - calculating top words for the generation of the tag cloud. - A new page indicates the list of inactive feeds (with no new published - articles since a given number of days). - -3.6 (2012-11-08) ----------------- - pyAggr3g470r is now running with Python 3.2(.3). It uses CherryPy 3.2.2, - BeautifulSoup4, and feedparser3.Your old MongoDB database can be used - without any problem. - -3.5 (2012-10-28) ----------------- - Some minor bugfixes and improvements. - An authentication screen has been added, which is useful if pyAggr3g470r - is running on an EC2 instance, for example. - -3.4 (2012-05-01) ----------------- - This version introduces some minor improvements and bugfixes. - All features of pyAggr3g470r are now back (with MongoDB). - -3.3 (2012-04-16) ----------------- - This version introduces minor improvements and a bugfix. - Publication dates of articles are now stored as a datetime object. - A bug in the script that converts an SQLite database to a MongoDB database - is now fixed. 
- A little documentation has been added. - -3.2 (2012-03-20) ----------------- - A MongoDB database is now used instead of the SQLite database. This change - offers a significant performance improvement. The database has been tested - with more than 30,000 articles, but version 3.2 is still a test version. - A more stable version will arrive soon. - -3.1 (2011-11-29) ----------------- - A new version of the QR Code module is used. For each article, a QR Code - is generated based on the content of the article. If the article is too - long, only the article's URL is encoded in the QR Code. For a given - string, the algorithm tries the generate as small a QR Code as possible. - Minor bugs were fixed. - -3.0 (2011-10-25) ----------------- - This release introduces exportation of articles to the HTML format and to - the PDF format (there is still exportation to ePub). - The sharing of articles with delicious.com was replaced by pinboard.in.s - -2.9 (2011-08-26) ----------------- - Some minor improvements. A bug with the HTML tag bas been fixed. - Cleanup was done with Pylint. - The test database of pyAggr3g470r contains more than 22000 articles, - and it runs perfectly. - -2.8 (2011-07-08) ----------------- - The feed summary page, which displays general information about a feed, - now lets you change the feed metadata (feed logo, feed name, and feed URL - if changed). Moreover, this page displays the activity of a feed and other - useful information. It is now possible to set a different POD for Diaspora - in the configuration file and to share an article with Google +1. - A control file to start or stop pyAggr3g470r has been added. - From the GUI side, a new transparent CSS tooltip has been introduced in - order to preview an article. - Finally, some minor performance improvements and bugfixes were made. - -2.7 (2011-04-15) ----------------- - Minor improvements. 
- It is now possible to set a maximum number of articles to be loaded from - the database for each feed (via the management page). - -2.6 (2011-03-21) ----------------- - This version introduces a new page that displays general information about - a feed. There are some minor improvements in the Web interface. - The version of pyAggr3g470r for Python 3 is now fully ready and has been - tested with Python 3.2. - -2.5 (2011-01-19) ----------------- - A bug when removing a feed from the data base was fixed. - Minor improvements were made for export of articles and the size of HTML - forms. - -2.4 (2010-12-07) ----------------- - The GUI uses more HTML 5 features like HTML5 Forms Validation - (email input, URL input), an HTML5 month+year date picker, and a - placeholder. From each article it is possible to access the - following and previous article (and a new main menu with CSS ToolTip). - Articles can now be exported to the EPUB format. Articles loaded from the - SQLite base are now stored in memory in a better data structure. With more - than 10,000 articles, pyAggr3g470r starts in 3 seconds. Finally, email - notifications are now sent with HTML message content and with an - alternative plain text version (MIMEMultipart). - -2.3 (2010-11-15) ----------------- - This version introduces HTML5 Forms Validation and a HTML5 month+year date - picker for the history page, which can be used to search for articles. - This currently only works with Opera. - -2.2 (2010-11-03) ----------------- - There is now a third way to export articles from the SQLite base. - There is an export method for the wiki DokuWiki (example in the commit - message). - -2.1 (2010-10-25) ----------------- - The export of articles to HTML has been updated, with better output. - There are a number of improvements (the search function, generation of - tags cloud, display of article content, CSS, bugfixes, etc.). - There is a new Wiki. 
- -2.0 (2010-09-03) ----------------- - It is now possible to browse articles by year and month with tag clouds - (see new screenshots). - In addition, URL errors are detected before downloading feeds. - There are some improvements in the user interface. - -1.9 (2010-09-02) ----------------- - The feedgetter module was improved. More details about articles are stored - in the database when possile. An attempt is made to get the whole article - (a_feed['entries'][i].content[j].value), and in the event of failure, - the description/summary is used (a_feed['entries'][i].description). - -1.8 (2010-08-25) ----------------- - It is now easier to install pyAggr3g470r. - There is no longer any need to set any path in the configuration file. - -1.7 (2010-07-23) ----------------- - This release generates QR codes with URLs of articles, so you can read an - article later on your smartphone (or share with friends). - -1.6 (2010-07-08) ----------------- - It is now possible to automatically add a feed (with the URL of the site), - delete an article, delete a feed with all its articles, and to delete all - articles from the database. - There are also some nice improvements to performance, tested with more - than 3000 articles. - Furthermore, HTML export of all the articles of the database was improved. - You can also export the articles in raw text. Finally, some minor bugs - were fixed. - -1.5 (2010-07-05) ----------------- - Now pyAggr3g470r only works with Python 2.7. - OrderedDict objects are used in order to sort the feeds alphabetically in - a simple way. - -1.4 (2010-06-10) ----------------- - It is now possible to remove all articles of a given feed from the SQLite - base via the management page. You can also add a feed just with the URL - of the Web page. The URL of the feed is obtained by parsing the Web page - with the module BeautifulSoup. 
- -1.3 (2010-05-04) ----------------- - All articles stored in the SQLite database can now be exported as HTML or - raw text via the management page. - -1.2 (2010-04-29) ----------------- - This version introduces a tag cloud with variable word length. - Some improvements were made to the CSS and a bug was fixed. - -1.1 (2010-04-15) ----------------- - Introduction of a Google Buzz button. - It is now possible to mark or unmark articles as favorites. - -1.0 (2010-03-23) ----------------- - The database of feeds is monitored with the Python gamin module, - if present. Otherwise it is done with a classic function. - You now have the option to be informed of new articles by email. To - receive these notifications, just click on "Stay tuned" for the - desired feed(s) at the main page of pyAggr3g470r in the browser. - -0.9 (2010-02-28) ----------------- - TuxDroid tells you when there are unread articles (this module is - independent in case you don't have a TuxDroid). Moreover, the language of - articles is detected (thanks to the oice.langdet Python module). This - allows you to search for articles by language. - -0.8 (2010-02-24) ----------------- - It is now possible to share articles with delicious, Digg, reddit, - Scoopeo, and Blogmarks. - The "Management of feeds" page presents information on the database and - statistics on articles (with a histogram). HTML tags are now skipped for - the search. Some other improvements were made. - -0.7 (2010-02-15) ----------------- - It is now possible to search for an article, through the titles and - descriptions. - -0.6 (2010-02-05) ----------------- - Unread articles are now shown in bold. This was implemented using a new - field in the SQLite database. New tabs for article descriptions are opened - with the _rel=noreferrer_ option in order to separate processes (useful - with Chromium). It is now possible to see only unread articles for each feed. 
- -0.5 (2010-02-02) ----------------- - It is now possible to fetch feeds manually by clicking on "Fetch all feeds" - and/or with cron. Better navigation between feeds and improvements to the - SQLite database have been added. - -0.4 (2010-02-01) ----------------- - Release 0.4. The main page display only 10 articles by feeds. - For each feeds a page present the list of all articles. The SQLite base is - smaller than before (removed hashed value). - A lot of improvements. - -0.3 (2010-02-01) ----------------- - A new menu was added for faster access to feeds. Some improvements were - made to the CSS. - -0.2 (2010-01-31) ----------------- - Articles are now sorted by date, and it is possible to read just a - description of an article. There are some improvements in the code and - SQLite base management. - -0.1 (2010-01-29) ----------------- - First release of pyAggr3g470r. diff --git a/CHANGELOG.rst b/CHANGELOG.rst new file mode 100644 index 00000000..185dea8a --- /dev/null +++ b/CHANGELOG.rst @@ -0,0 +1,518 @@ +================= +Release History +================= + +current +------- + New: + * Redoing entirely the home page with react, JARR is going on toward a one page app. + * Implementing categories + Improvements: + * Code re-arrangement: move all code to /src/ + +6.7 (2015-07-21) +---------------- + New: + * a filter mechanism for feeds has been added (PR #14); + * icon of feeds is now an url retrieved from the feed or the site link (PR #15). + Improvements: + * improvements for the bookmarklet (PR #16 and PR #18); + * performance improvements (display of the /feed page); + * enhancements for the two crawlers; + * enhancements of the UI (PR #14); + * misc changes to the models and controllers. + +6.6 (2015-06-02) +---------------- + New: + * it is now possible to sort the list of articles by feed title or article title; + * added base unittests. 
+ Improvements: + * fixed some minor bugs; + * improved the asyncio crawler; + * automatically use the good Python executable for the asyncio crawler; + * improved controllers (enforcing better use of user_id); + * the search is now case insensitive. + +6.5.5 (2015-04-22) +------------------ + The full text search powered by Whoosh has been removed. + +6.5.4 (2015-04-16) +------------------ + This release introduces a new config module and a new search functionality. + The result of a search is integrated in the main page. + +6.5.3 (2015-04-14) +------------------ + The fetch call is now dependent to the selected crawling method. + +6.5.2 (2015-04-14) +------------------ + The look and feel has been globally improved. + It is now possible to add a new feed from any page via a dropdown menu. + +6.5.1 (2015-04-08) +------------------ + Improvements: + * improvements on the controllers; + * the minimum error count is now specified in the configuration file. + +6.5 (2015-04-07) +---------------- + Improvements: + * new CSS; + * improved installation script; + * it is now possible to delete all duplicate articles with one HTTP delete request. + +6.4 (2015-03-17) +---------------- + New: + * a new page 'history' to explore your database of articles. + Changes: + * updated documentation; + * minor improvements. + Fixes: + * changed the type of the column 'last_modified' to string. + +6.3 (2015-03-08) +---------------- + New: + * a new architecture with base for controllers; + * new, more complete RESTful API; + * a crawler handling errors and cache control; + * the new crawler takes advantage of the RESTful API + (can be run on the client side). + +6.2 (2015-02-26) +---------------- + The system of email notifications for new articles has been removed. + This feature was hardly used. + +6.1 (2015-02-23) +---------------- + Improvements: articles are now identified with the id provided + by the RSS/ATOM feed. 
+ Prevent BeautifulSoup4 from adding extra '<html><body>' tags to + the soup with the 'lxml' parser. + Indexation is now used with the new crawler. + The documentation has been updated. + +6.0 (2015-02-08) +---------------- + New: pyAggr3g470r is now working with Python 3.4. Tested on Heroku + with Python 3.4.2. + Improvements: The feed crawler uses the PEP 3156 (asyncio). The action + buttons are now on the left. It is easier to mark an article as read. + +5.7.0 (2014-11-20) +------------------ + Improvements: major improvements of the email notifications system. + Notifications are now sent through Postmark (for example for Heroku) + or a standard SMTP server. + Bug fix: activation key was too long for the database column. + +5.6.6 (2014-09-24) +------------------ + Improvements: Significant performance improvement for the views + /favorites and /unread. The performance problem has been introduced + with the release 5.6.5. + +5.6.5 (2014-09-15) +------------------ + This release fixes a major bug introduced with the version 0.9.7 of SQLAlchemy + (conflicts between persistent and transient SQLAlchemy objects). + +5.6.4 (2014-09-12) +------------------ + Translations have been updated. + Gravatar extension has been removed. + Minor fix. + +5.6.3 (2014-08-27) +------------------ + News: It is now possible to delete articles older than a given number + of weeks. This can be done via the management page. + A new environment variable enables to activate or deactivate the + self-registration. + Changes: translations were updated. Some minor bugfixes. Improved + deployment instructions in the README. + +5.6.2 (2014-08-10) +------------------ + Minor improvements: Articles are automatically retrieved after the import + of an OPML file. + When displaying all articles (unread + read), titles of unread articles + are emphasized in bold. + A new tab is opened when clicking on the title of an article. + New: pyAggr3g470r can be deployed with the Heroku auto deploy button. 
+ +5.6.1 (2014-07-13) +------------------ + Performance improvements: faster database insertion of articles by + the crawler and loading of the management page. + Minor bug fixes. + +5.6 (2014-07-05) +---------------- + pyAggr3g470r has now a RESTful JSON API which enables to manage Feed and + Article objects. The API is documented in the README file. + The main page is using a subset of this API with jQuery HTTP requests. + +5.5 (2014-06-14) +---------------- + This release introduces a redesigned homepage which loads much faster and + is easier to read. pyAggr3g470r can now be run by Apache. + Adding a feed no longer requires a title and a site link. + +5.4 (2014-05-28) +---------------- + This version makes it possible for a user to export all of their feeds and + articles as a JSON file for later restoration. + +5.3 (2014-05-23) +---------------- + This release introduces some UI improvements, especially for the home page. + +5.2 (2014-05-16) +---------------- + This release adds minor bug fixes and UI improvements. + +5.1 (2014-05-13) +---------------- + When deployed on Heroku, the platform now uses the Postmark service to + send account confirmation emails to users. It is no longer required to + enter a first name and a last name to create an account. + +5.0 (2014-05-04) +---------------- + pyAggr3g470r is now translated into English and French. Improvements + concerning the news retriever and the Whoosh search functionality have + been made. The user can now export all articles in JSON format. + The user of the platform now has the possibility to delete his or her + account. + +4.9 (2014-04-24) +---------------- + This version introduces minor improvements to the Jinja templates and + bugfixes (relating to the import of OPML files with sub-categories and + relating to the Whoosh index generation). + +4.8 (2014-04-13) +---------------- + Feeds are now retrieved in a separated process with the Gevent library. + This offers the best performance on Heroku. 
+ +4.7 (2014-04-12) +---------------- + pyAggr3g470r can now be deployed on Heroku or on a traditional server. + Moreover, several users can use an instance of pyAggr3g470r. A platform is + managed by the administrator, a user with specific rights. + +4.6 (2014-02-09) +---------------- + This release introduces the import of OPML files of subscriptions. + Minor improvements have been made to the templates. + +4.5 (2014-01-29) +---------------- + This release introduces a one step installation process with a simple + script. Minor improvements to the feedgetter module have been introduced + (the feed description is now stored in the database). Miscellaneous + improvements to the Jinja templates. Finally, more configuration options + are now offered to the user. + +4.4 (2013-12-27) +---------------- + This version introduces some improvements for the feedgetter module + including automatic retrieval of the URL behind feedproxy.google.com, + and support for configuring the user agent and proxy. Minor improvements + were made to the MongoEngine models. Notifications are displayed with + Flask flash messages. + +4.3 (2013-12-03) +---------------- + With this release, the user is able to update her personal information. + It is now possible to enable/disable the checking of updates for a feed. + Some performance improvements and user interface optimizations have been + done. + +4.2 (2013-11-10) +---------------- + This is the first release of the new version of pyAggr3g470r. + The code has been rewritten with the Flask microframework and the + Bootstrap frontend framework. + +4.1 (2013-08-11) +---------------- + HTTP proxy support has been added for the fetching of feeds. This is + useful, for example, if you are using Privoxy/Tor. + +4.0 (2013-06-25) +---------------- + Searching of articles is now achieved with Whoosh, a fast full-text + indexing and searching library. + +3.9 (2013-04-14) +---------------- + The code has been tested and ported to Python 3.3.1. 
Some minor bugs have + been fixed, with a lot of improvements concerning the Mako templates, + MongoDB database management, and management of exceptions. + +3.8 (2013-01-12) +---------------- + This release introduces a reworked management page; it is now possible to + change the username and password via this page. + Some improvements concerning the HTML export of the database have been + made. Finally, indexed MongoDB full text searching provides a much faster + search. + +3.7 (2012-12-29) +---------------- + pyAggr3g470r is now using the Mako template library. + Performance improvements for the MongoDB database have been made, and some + minor bugfixes. Stop words (a, of, the, an, for...) are now ignored when + calculating top words for the generation of the tag cloud. + A new page indicates the list of inactive feeds (with no new published + articles since a given number of days). + +3.6 (2012-11-08) +---------------- + pyAggr3g470r is now running with Python 3.2(.3). It uses CherryPy 3.2.2, + BeautifulSoup4, and feedparser3. Your old MongoDB database can be used + without any problem. + +3.5 (2012-10-28) +---------------- + Some minor bugfixes and improvements. + An authentication screen has been added, which is useful if pyAggr3g470r + is running on an EC2 instance, for example. + +3.4 (2012-05-01) +---------------- + This version introduces some minor improvements and bugfixes. + All features of pyAggr3g470r are now back (with MongoDB). + +3.3 (2012-04-16) +---------------- + This version introduces minor improvements and a bugfix. + Publication dates of articles are now stored as a datetime object. + A bug in the script that converts an SQLite database to a MongoDB database + is now fixed. + A little documentation has been added. + +3.2 (2012-03-20) +---------------- + A MongoDB database is now used instead of the SQLite database. This change + offers a significant performance improvement. 
The database has been tested + with more than 30,000 articles, but version 3.2 is still a test version. + A more stable version will arrive soon. + +3.1 (2011-11-29) +---------------- + A new version of the QR Code module is used. For each article, a QR Code + is generated based on the content of the article. If the article is too + long, only the article's URL is encoded in the QR Code. For a given + string, the algorithm tries to generate as small a QR Code as possible. + Minor bugs were fixed. + +3.0 (2011-10-25) +---------------- + This release introduces exportation of articles to the HTML format and to + the PDF format (there is still exportation to ePub). + The sharing of articles with delicious.com was replaced by pinboard.in. + +2.9 (2011-08-26) +---------------- + Some minor improvements. A bug with the HTML tag has been fixed. + Cleanup was done with Pylint. + The test database of pyAggr3g470r contains more than 22000 articles, + and it runs perfectly. + +2.8 (2011-07-08) +---------------- + The feed summary page, which displays general information about a feed, + now lets you change the feed metadata (feed logo, feed name, and feed URL + if changed). Moreover, this page displays the activity of a feed and other + useful information. It is now possible to set a different POD for Diaspora + in the configuration file and to share an article with Google +1. + A control file to start or stop pyAggr3g470r has been added. + From the GUI side, a new transparent CSS tooltip has been introduced in + order to preview an article. + Finally, some minor performance improvements and bugfixes were made. + +2.7 (2011-04-15) +---------------- + Minor improvements. + It is now possible to set a maximum number of articles to be loaded from + the database for each feed (via the management page). + +2.6 (2011-03-21) +---------------- + This version introduces a new page that displays general information about + a feed. 
There are some minor improvements in the Web interface. + The version of pyAggr3g470r for Python 3 is now fully ready and has been + tested with Python 3.2. + +2.5 (2011-01-19) +---------------- + A bug when removing a feed from the data base was fixed. + Minor improvements were made for export of articles and the size of HTML + forms. + +2.4 (2010-12-07) +---------------- + The GUI uses more HTML 5 features like HTML5 Forms Validation + (email input, URL input), an HTML5 month+year date picker, and a + placeholder. From each article it is possible to access the + following and previous article (and a new main menu with CSS ToolTip). + Articles can now be exported to the EPUB format. Articles loaded from the + SQLite base are now stored in memory in a better data structure. With more + than 10,000 articles, pyAggr3g470r starts in 3 seconds. Finally, email + notifications are now sent with HTML message content and with an + alternative plain text version (MIMEMultipart). + +2.3 (2010-11-15) +---------------- + This version introduces HTML5 Forms Validation and a HTML5 month+year date + picker for the history page, which can be used to search for articles. + This currently only works with Opera. + +2.2 (2010-11-03) +---------------- + There is now a third way to export articles from the SQLite base. + There is an export method for the wiki DokuWiki (example in the commit + message). + +2.1 (2010-10-25) +---------------- + The export of articles to HTML has been updated, with better output. + There are a number of improvements (the search function, generation of + tags cloud, display of article content, CSS, bugfixes, etc.). + There is a new Wiki. + +2.0 (2010-09-03) +---------------- + It is now possible to browse articles by year and month with tag clouds + (see new screenshots). + In addition, URL errors are detected before downloading feeds. + There are some improvements in the user interface. 
+ +1.9 (2010-09-02) +---------------- + The feedgetter module was improved. More details about articles are stored + in the database when possible. An attempt is made to get the whole article + (a_feed['entries'][i].content[j].value), and in the event of failure, + the description/summary is used (a_feed['entries'][i].description). + +1.8 (2010-08-25) +---------------- + It is now easier to install pyAggr3g470r. + There is no longer any need to set any path in the configuration file. + +1.7 (2010-07-23) +---------------- + This release generates QR codes with URLs of articles, so you can read an + article later on your smartphone (or share with friends). + +1.6 (2010-07-08) +---------------- + It is now possible to automatically add a feed (with the URL of the site), + delete an article, delete a feed with all its articles, and to delete all + articles from the database. + There are also some nice improvements to performance, tested with more + than 3000 articles. + Furthermore, HTML export of all the articles of the database was improved. + You can also export the articles in raw text. Finally, some minor bugs + were fixed. + +1.5 (2010-07-05) +---------------- + Now pyAggr3g470r only works with Python 2.7. + OrderedDict objects are used in order to sort the feeds alphabetically in + a simple way. + +1.4 (2010-06-10) +---------------- + It is now possible to remove all articles of a given feed from the SQLite + base via the management page. You can also add a feed just with the URL + of the Web page. The URL of the feed is obtained by parsing the Web page + with the module BeautifulSoup. + +1.3 (2010-05-04) +---------------- + All articles stored in the SQLite database can now be exported as HTML or + raw text via the management page. + +1.2 (2010-04-29) +---------------- + This version introduces a tag cloud with variable word length. + Some improvements were made to the CSS and a bug was fixed.
+ +1.1 (2010-04-15) +---------------- + Introduction of a Google Buzz button. + It is now possible to mark or unmark articles as favorites. + +1.0 (2010-03-23) +---------------- + The database of feeds is monitored with the Python gamin module, + if present. Otherwise it is done with a classic function. + You now have the option to be informed of new articles by email. To + receive these notifications, just click on "Stay tuned" for the + desired feed(s) at the main page of pyAggr3g470r in the browser. + +0.9 (2010-02-28) +---------------- + TuxDroid tells you when there are unread articles (this module is + independent in case you don't have a TuxDroid). Moreover, the language of + articles is detected (thanks to the oice.langdet Python module). This + allows you to search for articles by language. + +0.8 (2010-02-24) +---------------- + It is now possible to share articles with delicious, Digg, reddit, + Scoopeo, and Blogmarks. + The "Management of feeds" page presents information on the database and + statistics on articles (with a histogram). HTML tags are now skipped for + the search. Some other improvements were made. + +0.7 (2010-02-15) +---------------- + It is now possible to search for an article, through the titles and + descriptions. + +0.6 (2010-02-05) +---------------- + Unread articles are now shown in bold. This was implemented using a new + field in the SQLite database. New tabs for article descriptions are opened + with the _rel=noreferrer_ option in order to separate processes (useful + with Chromium). It is now possible to see only unread articles for each feed. + +0.5 (2010-02-02) +---------------- + It is now possible to fetch feeds manually by clicking on "Fetch all feeds" + and/or with cron. Better navigation between feeds and improvements to the + SQLite database have been added. + +0.4 (2010-02-01) +---------------- + Release 0.4. The main page displays only 10 articles per feed. + For each feed, a page presents the list of all articles.
The SQLite base is + smaller than before (removed hashed value). + A lot of improvements. + +0.3 (2010-02-01) +---------------- + A new menu was added for faster access to feeds. Some improvements were + made to the CSS. + +0.2 (2010-01-31) +---------------- + Articles are now sorted by date, and it is possible to read just a + description of an article. There are some improvements in the code and + SQLite base management. + +0.1 (2010-01-29) +---------------- + First release of pyAggr3g470r. diff --git a/README.rst b/README.rst index fbfda039..5d9733f8 100644 --- a/README.rst +++ b/README.rst @@ -1,15 +1,14 @@ -++++ +==== JARR -++++ +==== Presentation -============ +------------ -`JARR (Just Another RSS Reader) `_ is a -web-based news aggregator. +`JARR (Just Another RSS Reader) `_ is a web-based news aggregator and reader. Main features -============= +------------- * can be easily deployed on Heroku or on a traditional server; * multiple users can use a JARR instance; @@ -21,37 +20,33 @@ Main features * detection of inactive feeds; * share articles with Google +, Pinboard and reddit. -The core technologies are `Flask `_, -`asyncio `_ and -`SQLAlchemy `_. +The core technologies are `Flask `_, `asyncio `_ and `SQLAlchemy `_. Python 3.5 is recommended. Documentation -============= +------------- A documentation is available `here `_ and provides different ways to install JARR. Internationalization -==================== +-------------------- JARR is translated into English and French. Donation -======== +-------- -If you wish and if you like *JARR*, you can donate via bitcoin -`1GVmhR9fbBeEh7rP1qNq76jWArDdDQ3otZ `_. +If you wish and if you like *JARR*, you can donate via bitcoin `1GVmhR9fbBeEh7rP1qNq76jWArDdDQ3otZ `_. Thank you! License -======= +------- -`JARR `_ -is under the `GNU Affero General Public License version 3 `_. +`JARR `_ is under the `GNU Affero General Public License version 3 `_. Contact -======= +------- `My home page `_. 
diff --git a/src/crawler.py b/src/crawler.py deleted file mode 100644 index 0598c418..00000000 --- a/src/crawler.py +++ /dev/null @@ -1,168 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf-8 - - -# jarr - A Web based news aggregator. -# Copyright (C) 2010-2015 Cédric Bonhomme - https://www.JARR-aggregator.org -# -# For more information : https://github.com/JARR-aggregator/JARR/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 3.3 $" -__date__ = "$Date: 2010/09/02 $" -__revision__ = "$Date: 2015/12/07 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "AGPLv3" - -import asyncio -import logging -import feedparser -import dateutil.parser -from datetime import datetime -from sqlalchemy import or_ - -import conf -from bootstrap import db -from web.models import User -from web.controllers import FeedController, ArticleController -from web.lib.feed_utils import construct_feed_from, is_parsing_ok -from web.lib.article_utils import construct_article, extract_id - -logger = logging.getLogger(__name__) - -sem = asyncio.Semaphore(5) - -import ssl -try: - _create_unverified_https_context = ssl._create_unverified_context -except AttributeError: - # Legacy Python that doesn't verify HTTPS certificates by default - pass -else: - # Handle target environment that doesn't support HTTPS verification - ssl._create_default_https_context = _create_unverified_https_context - - -async def get(*args, **kwargs): - #kwargs["connector"] = aiohttp.TCPConnector(verify_ssl=False) - try: - data = feedparser.parse(args[0]) - return data - except Exception as e: - raise e - - -async def parse_feed(user, feed): - """ - Fetch a feed. - Update the feed and return the articles. 
- """ - parsed_feed = None - up_feed = {} - articles = [] - with (await sem): - try: - parsed_feed = await get(feed.link) - except Exception as e: - up_feed['last_error'] = str(e) - up_feed['error_count'] = feed.error_count + 1 - finally: - up_feed['last_retrieved'] = datetime.now(dateutil.tz.tzlocal()) - if parsed_feed is None: - FeedController().update({'id': feed.id}, up_feed) - return - - if not is_parsing_ok(parsed_feed): - up_feed['last_error'] = str(parsed_feed['bozo_exception']) - up_feed['error_count'] = feed.error_count + 1 - FeedController().update({'id': feed.id}, up_feed) - return - if parsed_feed['entries'] != []: - articles = parsed_feed['entries'] - - up_feed['error_count'] = 0 - up_feed['last_error'] = "" - - # Feed informations - construct_feed_from(feed.link, parsed_feed).update(up_feed) - if feed.title and 'title' in up_feed: - # do not override the title set by the user - del up_feed['title'] - FeedController().update({'id': feed.id}, up_feed) - - return articles - - -async def insert_database(user, feed): - - articles = await parse_feed(user, feed) - if None is articles: - return [] - - logger.debug('inserting articles for {}'.format(feed.title)) - - logger.info("Database insertion...") - new_articles = [] - art_contr = ArticleController(user.id) - for article in articles: - exist = art_contr.read(feed_id=feed.id, - **extract_id(article)).count() != 0 - if exist: - logger.debug("Article %r (%r) already in the database.", - article['title'], article['link']) - continue - article = construct_article(article, feed) - try: - new_articles.append(art_contr.create(**article)) - logger.info("New article % (%r) added.", - article['title'], article['link']) - except Exception: - logger.exception("Error when inserting article in database:") - continue - return new_articles - - -async def init_process(user, feed): - # Fetch the feed and insert new articles in the database - articles = await insert_database(user, feed) - logger.debug('inserted articles for 
%s', feed.title) - return articles - - -def retrieve_feed(loop, user, feed_id=None): - """ - Launch the processus. - """ - logger.info("Starting to retrieve feeds.") - - # Get the list of feeds to fetch - user = User.query.filter(User.email == user.email).first() - feeds = [feed for feed in user.feeds if - feed.error_count <= conf.DEFAULT_MAX_ERROR and feed.enabled] - if feed_id is not None: - feeds = [feed for feed in feeds if feed.id == feed_id] - - if feeds == []: - return - - # Launch the process for all the feeds - tasks = [asyncio.ensure_future(init_process(user, feed)) for feed in feeds] - - try: - loop.run_until_complete(asyncio.wait(tasks)) - except Exception: - logger.exception('an error occured') - - logger.info("All articles retrieved. End of the processus.") diff --git a/src/crawler/classic_crawler.py b/src/crawler/classic_crawler.py new file mode 100644 index 00000000..0598c418 --- /dev/null +++ b/src/crawler/classic_crawler.py @@ -0,0 +1,168 @@ +#! /usr/bin/env python +# -*- coding: utf-8 - + +# jarr - A Web based news aggregator. +# Copyright (C) 2010-2015 Cédric Bonhomme - https://www.JARR-aggregator.org +# +# For more information : https://github.com/JARR-aggregator/JARR/ +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +__author__ = "Cedric Bonhomme" +__version__ = "$Revision: 3.3 $" +__date__ = "$Date: 2010/09/02 $" +__revision__ = "$Date: 2015/12/07 $" +__copyright__ = "Copyright (c) Cedric Bonhomme" +__license__ = "AGPLv3" + +import asyncio +import logging +import feedparser +import dateutil.parser +from datetime import datetime +from sqlalchemy import or_ + +import conf +from bootstrap import db +from web.models import User +from web.controllers import FeedController, ArticleController +from web.lib.feed_utils import construct_feed_from, is_parsing_ok +from web.lib.article_utils import construct_article, extract_id + +logger = logging.getLogger(__name__) + +sem = asyncio.Semaphore(5) + +import ssl +try: + _create_unverified_https_context = ssl._create_unverified_context +except AttributeError: + # Legacy Python that doesn't verify HTTPS certificates by default + pass +else: + # Handle target environment that doesn't support HTTPS verification + ssl._create_default_https_context = _create_unverified_https_context + + +async def get(*args, **kwargs): + #kwargs["connector"] = aiohttp.TCPConnector(verify_ssl=False) + try: + data = feedparser.parse(args[0]) + return data + except Exception as e: + raise e + + +async def parse_feed(user, feed): + """ + Fetch a feed. + Update the feed and return the articles. 
+ """ + parsed_feed = None + up_feed = {} + articles = [] + with (await sem): + try: + parsed_feed = await get(feed.link) + except Exception as e: + up_feed['last_error'] = str(e) + up_feed['error_count'] = feed.error_count + 1 + finally: + up_feed['last_retrieved'] = datetime.now(dateutil.tz.tzlocal()) + if parsed_feed is None: + FeedController().update({'id': feed.id}, up_feed) + return + + if not is_parsing_ok(parsed_feed): + up_feed['last_error'] = str(parsed_feed['bozo_exception']) + up_feed['error_count'] = feed.error_count + 1 + FeedController().update({'id': feed.id}, up_feed) + return + if parsed_feed['entries'] != []: + articles = parsed_feed['entries'] + + up_feed['error_count'] = 0 + up_feed['last_error'] = "" + + # Feed informations + construct_feed_from(feed.link, parsed_feed).update(up_feed) + if feed.title and 'title' in up_feed: + # do not override the title set by the user + del up_feed['title'] + FeedController().update({'id': feed.id}, up_feed) + + return articles + + +async def insert_database(user, feed): + + articles = await parse_feed(user, feed) + if None is articles: + return [] + + logger.debug('inserting articles for {}'.format(feed.title)) + + logger.info("Database insertion...") + new_articles = [] + art_contr = ArticleController(user.id) + for article in articles: + exist = art_contr.read(feed_id=feed.id, + **extract_id(article)).count() != 0 + if exist: + logger.debug("Article %r (%r) already in the database.", + article['title'], article['link']) + continue + article = construct_article(article, feed) + try: + new_articles.append(art_contr.create(**article)) + logger.info("New article % (%r) added.", + article['title'], article['link']) + except Exception: + logger.exception("Error when inserting article in database:") + continue + return new_articles + + +async def init_process(user, feed): + # Fetch the feed and insert new articles in the database + articles = await insert_database(user, feed) + logger.debug('inserted articles for 
%s', feed.title) + return articles + + +def retrieve_feed(loop, user, feed_id=None): + """ + Launch the processus. + """ + logger.info("Starting to retrieve feeds.") + + # Get the list of feeds to fetch + user = User.query.filter(User.email == user.email).first() + feeds = [feed for feed in user.feeds if + feed.error_count <= conf.DEFAULT_MAX_ERROR and feed.enabled] + if feed_id is not None: + feeds = [feed for feed in feeds if feed.id == feed_id] + + if feeds == []: + return + + # Launch the process for all the feeds + tasks = [asyncio.ensure_future(init_process(user, feed)) for feed in feeds] + + try: + loop.run_until_complete(asyncio.wait(tasks)) + except Exception: + logger.exception('an error occured') + + logger.info("All articles retrieved. End of the processus.") diff --git a/src/crawler/http_crawler.py b/src/crawler/http_crawler.py new file mode 100644 index 00000000..f480fe96 --- /dev/null +++ b/src/crawler/http_crawler.py @@ -0,0 +1,251 @@ +""" +Here's a sum up on how it works : + +CrawlerScheduler.run + will retreive a list of feeds to be refreshed and pass result to +CrawlerScheduler.callback + which will retreive each feed and treat result with +FeedCrawler.callback + which will interprete the result (status_code, etag) collect ids + and match them agaisnt pyagg which will cause +PyAggUpdater.callback + to create the missing entries +""" + +import time +import conf +import json +import logging +import feedparser +from datetime import datetime, timedelta +from time import strftime, gmtime +from concurrent.futures import ThreadPoolExecutor +from requests_futures.sessions import FuturesSession +from web.lib.utils import default_handler, to_hash +from web.lib.feed_utils import construct_feed_from +from web.lib.article_utils import extract_id, construct_article + +logger = logging.getLogger(__name__) +logging.captureWarnings(True) +API_ROOT = "api/v2.0/" + + +class AbstractCrawler: + + def __init__(self, auth, pool=None, session=None): + self.auth = auth + 
self.pool = pool or ThreadPoolExecutor(max_workers=conf.NB_WORKER) + self.session = session or FuturesSession(executor=self.pool) + self.session.verify = False + self.url = conf.PLATFORM_URL + + def query_pyagg(self, method, urn, data=None): + """A wrapper for internal call, method should be ones you can find + on requests (header, post, get, options, ...), urn the distant + resources you want to access on pyagg, and data, the data you wanna + transmit.""" + if data is None: + data = {} + method = getattr(self.session, method) + return method("%s%s%s" % (self.url, API_ROOT, urn), + auth=self.auth, data=json.dumps(data, + default=default_handler), + headers={'Content-Type': 'application/json', + 'User-Agent': conf.USER_AGENT}) + + def wait(self, max_wait=300, checks=5, wait_for=2): + checked, second_waited = 0, 0 + while True: + time.sleep(wait_for) + second_waited += wait_for + if second_waited > max_wait: + logger.warn('Exiting after %d seconds', second_waited) + break + if self.pool._work_queue.qsize(): + checked = 0 + continue + checked += 1 + if checked == checks: + break + + +class PyAggUpdater(AbstractCrawler): + + def __init__(self, feed, entries, headers, parsed_feed, + auth, pool=None, session=None): + self.feed = feed + self.entries = entries + self.headers = headers + self.parsed_feed = parsed_feed + super().__init__(auth, pool, session) + + def callback(self, response): + """Will process the result from the challenge, creating missing article + and updating the feed""" + article_created = False + if response.result().status_code != 204: + results = response.result().json() + logger.debug('%r %r - %d entries were not matched ' + 'and will be created', + self.feed['id'], self.feed['title'], len(results)) + for id_to_create in results: + article_created = True + entry = construct_article( + self.entries[tuple(sorted(id_to_create.items()))], + self.feed) + logger.info('%r %r - creating %r for %r - %r', self.feed['id'], + self.feed['title'], entry['title'], 
+ entry['user_id'], id_to_create) + self.query_pyagg('post', 'article', entry) + + logger.debug('%r %r - updating feed etag %r last_mod %r', + self.feed['id'], self.feed['title'], + self.headers.get('etag', ''), + self.headers.get('last-modified', '')) + + up_feed = {'error_count': 0, 'last_error': None, + 'etag': self.headers.get('etag', ''), + 'last_modified': self.headers.get('last-modified', + strftime('%a, %d %b %Y %X %Z', gmtime()))} + fresh_feed = construct_feed_from(url=self.feed['link'], + fp_parsed=self.parsed_feed) + for key in ('description', 'site_link', 'icon_url'): + if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key): + up_feed[key] = fresh_feed[key] + if not self.feed.get('title'): + up_feed['title'] = fresh_feed.get('title', '') + up_feed['user_id'] = self.feed['user_id'] + # re-getting that feed earlier since new entries appeared + if article_created: + up_feed['last_retrieved'] \ + = (datetime.now() - timedelta(minutes=45)).isoformat() + + diff_keys = {key for key in up_feed + if up_feed[key] != self.feed.get(key)} + if not diff_keys: + return # no change in the feed, no update + if not article_created and diff_keys == {'last_modified', 'etag'}: + return # meaningless if no new article has been published + logger.info('%r %r - pushing feed attrs %r', + self.feed['id'], self.feed['title'], + {key: "%s -> %s" % (up_feed[key], self.feed.get(key)) + for key in up_feed if up_feed[key] != self.feed.get(key)}) + + self.query_pyagg('put', 'feed/%d' % self.feed['id'], up_feed) + + +class FeedCrawler(AbstractCrawler): + + def __init__(self, feed, auth, pool=None, session=None): + self.feed = feed + super().__init__(auth, pool, session) + + def clean_feed(self): + """Will reset the errors counters on a feed that have known errors""" + if self.feed.get('error_count') or self.feed.get('last_error'): + self.query_pyagg('put', 'feed/%d' % self.feed['id'], + {'error_count': 0, 'last_error': ''}) + + def callback(self, response): + """will fetch the 
feed and interprete results (304, etag) or will + challenge pyagg to compare gotten entries with existing ones""" + try: + response = response.result() + response.raise_for_status() + except Exception as error: + error_count = self.feed['error_count'] + 1 + logger.exception('%r %r - an error occured while fetching ' + 'feed; bumping error count to %r', + self.feed['id'], self.feed['title'], error_count) + future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], + {'error_count': error_count, + 'last_error': str(error), + 'user_id': self.feed['user_id']}) + return + + if response.status_code == 304: + logger.info("%r %r - feed responded with 304", + self.feed['id'], self.feed['title']) + self.clean_feed() + return + if 'etag' not in response.headers: + logger.debug('%r %r - manually generating etag', + self.feed['id'], self.feed['title']) + response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text) + if response.headers['etag'] and self.feed['etag'] \ + and response.headers['etag'] == self.feed['etag']: + if 'pyagg' in self.feed['etag']: + logger.info("%r %r - calculated hash matches (%d)", + self.feed['id'], self.feed['title'], + response.status_code) + else: + logger.info("%r %r - feed responded with same etag (%d)", + self.feed['id'], self.feed['title'], + response.status_code) + self.clean_feed() + return + else: + logger.debug('%r %r - etag mismatch %r != %r', + self.feed['id'], self.feed['title'], + response.headers['etag'], self.feed['etag']) + logger.info('%r %r - cache validation failed, challenging entries', + self.feed['id'], self.feed['title']) + + ids, entries = [], {} + parsed_response = feedparser.parse(response.content) + for entry in parsed_response['entries']: + entry_ids = extract_id(entry) + entry_ids['feed_id'] = self.feed['id'] + entry_ids['user_id'] = self.feed['user_id'] + entries[tuple(sorted(entry_ids.items()))] = entry + ids.append(entry_ids) + logger.debug('%r %r - found %d entries %r', + self.feed['id'], self.feed['title'], 
len(ids), ids) + future = self.query_pyagg('get', 'articles/challenge', {'ids': ids}) + updater = PyAggUpdater(self.feed, entries, response.headers, + parsed_response, + self.auth, self.pool, self.session) + future.add_done_callback(updater.callback) + + +class CrawlerScheduler(AbstractCrawler): + + def __init__(self, username, password, pool=None, session=None): + self.auth = (username, password) + super(CrawlerScheduler, self).__init__(self.auth, pool, session) + + def prepare_headers(self, feed): + """For a known feed, will construct some header dictionnary""" + headers = {'User-Agent': conf.USER_AGENT} + if feed.get('last_modified'): + headers['If-Modified-Since'] = feed['last_modified'] + if feed.get('etag') and 'pyagg' not in feed['etag']: + headers['If-None-Match'] = feed['etag'] + logger.debug('%r %r - calculated headers %r', + feed['id'], feed['title'], headers) + return headers + + def callback(self, response): + """processes feeds that need to be fetched""" + response = response.result() + response.raise_for_status() + if response.status_code == 204: + logger.debug("No feed to fetch") + return + feeds = response.json() + logger.debug('%d to fetch %r', len(feeds), feeds) + for feed in feeds: + logger.debug('%r %r - fetching resources', + feed['id'], feed['title']) + future = self.session.get(feed['link'], + headers=self.prepare_headers(feed)) + + feed_crwlr = FeedCrawler(feed, self.auth, self.pool, self.session) + future.add_done_callback(feed_crwlr.callback) + + def run(self, **kwargs): + """entry point, will retreive feeds to be fetch + and launch the whole thing""" + logger.debug('retreving fetchable feed') + future = self.query_pyagg('get', 'feeds/fetchable', kwargs) + future.add_done_callback(self.callback) diff --git a/src/manager.py b/src/manager.py index f7240670..781d742b 100755 --- a/src/manager.py +++ b/src/manager.py @@ -32,7 +32,7 @@ def db_create(): @manager.command def fetch(limit=100, retreive_all=False): "Crawl the feeds with the client 
crawler." - from web.lib.crawler import CrawlerScheduler + from crawler.http_crawler import CrawlerScheduler scheduler = CrawlerScheduler(conf.API_LOGIN, conf.API_PASSWD) scheduler.run(limit=limit, retreive_all=retreive_all) scheduler.wait() @@ -47,7 +47,7 @@ def fetch_asyncio(user_id, feed_id): populate_g() from flask import g from web.models import User - import crawler + from crawler import classic_crawler users = [] try: users = User.query.filter(User.id == int(user_id)).all() @@ -67,7 +67,7 @@ def fetch_asyncio(user_id, feed_id): if user.activation_key == "": print("Fetching articles for " + user.nickname) g.user = user - crawler.retrieve_feed(loop, g.user, feed_id) + classic_crawler.retrieve_feed(loop, g.user, feed_id) loop.close() from scripts.probes import ArticleProbe, FeedProbe diff --git a/src/web/lib/crawler.py b/src/web/lib/crawler.py deleted file mode 100644 index f480fe96..00000000 --- a/src/web/lib/crawler.py +++ /dev/null @@ -1,251 +0,0 @@ -""" -Here's a sum up on how it works : - -CrawlerScheduler.run - will retreive a list of feeds to be refreshed and pass result to -CrawlerScheduler.callback - which will retreive each feed and treat result with -FeedCrawler.callback - which will interprete the result (status_code, etag) collect ids - and match them agaisnt pyagg which will cause -PyAggUpdater.callback - to create the missing entries -""" - -import time -import conf -import json -import logging -import feedparser -from datetime import datetime, timedelta -from time import strftime, gmtime -from concurrent.futures import ThreadPoolExecutor -from requests_futures.sessions import FuturesSession -from web.lib.utils import default_handler, to_hash -from web.lib.feed_utils import construct_feed_from -from web.lib.article_utils import extract_id, construct_article - -logger = logging.getLogger(__name__) -logging.captureWarnings(True) -API_ROOT = "api/v2.0/" - - -class AbstractCrawler: - - def __init__(self, auth, pool=None, session=None): - self.auth = 
auth - self.pool = pool or ThreadPoolExecutor(max_workers=conf.NB_WORKER) - self.session = session or FuturesSession(executor=self.pool) - self.session.verify = False - self.url = conf.PLATFORM_URL - - def query_pyagg(self, method, urn, data=None): - """A wrapper for internal call, method should be ones you can find - on requests (header, post, get, options, ...), urn the distant - resources you want to access on pyagg, and data, the data you wanna - transmit.""" - if data is None: - data = {} - method = getattr(self.session, method) - return method("%s%s%s" % (self.url, API_ROOT, urn), - auth=self.auth, data=json.dumps(data, - default=default_handler), - headers={'Content-Type': 'application/json', - 'User-Agent': conf.USER_AGENT}) - - def wait(self, max_wait=300, checks=5, wait_for=2): - checked, second_waited = 0, 0 - while True: - time.sleep(wait_for) - second_waited += wait_for - if second_waited > max_wait: - logger.warn('Exiting after %d seconds', second_waited) - break - if self.pool._work_queue.qsize(): - checked = 0 - continue - checked += 1 - if checked == checks: - break - - -class PyAggUpdater(AbstractCrawler): - - def __init__(self, feed, entries, headers, parsed_feed, - auth, pool=None, session=None): - self.feed = feed - self.entries = entries - self.headers = headers - self.parsed_feed = parsed_feed - super().__init__(auth, pool, session) - - def callback(self, response): - """Will process the result from the challenge, creating missing article - and updating the feed""" - article_created = False - if response.result().status_code != 204: - results = response.result().json() - logger.debug('%r %r - %d entries were not matched ' - 'and will be created', - self.feed['id'], self.feed['title'], len(results)) - for id_to_create in results: - article_created = True - entry = construct_article( - self.entries[tuple(sorted(id_to_create.items()))], - self.feed) - logger.info('%r %r - creating %r for %r - %r', self.feed['id'], - self.feed['title'], 
entry['title'], - entry['user_id'], id_to_create) - self.query_pyagg('post', 'article', entry) - - logger.debug('%r %r - updating feed etag %r last_mod %r', - self.feed['id'], self.feed['title'], - self.headers.get('etag', ''), - self.headers.get('last-modified', '')) - - up_feed = {'error_count': 0, 'last_error': None, - 'etag': self.headers.get('etag', ''), - 'last_modified': self.headers.get('last-modified', - strftime('%a, %d %b %Y %X %Z', gmtime()))} - fresh_feed = construct_feed_from(url=self.feed['link'], - fp_parsed=self.parsed_feed) - for key in ('description', 'site_link', 'icon_url'): - if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key): - up_feed[key] = fresh_feed[key] - if not self.feed.get('title'): - up_feed['title'] = fresh_feed.get('title', '') - up_feed['user_id'] = self.feed['user_id'] - # re-getting that feed earlier since new entries appeared - if article_created: - up_feed['last_retrieved'] \ - = (datetime.now() - timedelta(minutes=45)).isoformat() - - diff_keys = {key for key in up_feed - if up_feed[key] != self.feed.get(key)} - if not diff_keys: - return # no change in the feed, no update - if not article_created and diff_keys == {'last_modified', 'etag'}: - return # meaningless if no new article has been published - logger.info('%r %r - pushing feed attrs %r', - self.feed['id'], self.feed['title'], - {key: "%s -> %s" % (up_feed[key], self.feed.get(key)) - for key in up_feed if up_feed[key] != self.feed.get(key)}) - - self.query_pyagg('put', 'feed/%d' % self.feed['id'], up_feed) - - -class FeedCrawler(AbstractCrawler): - - def __init__(self, feed, auth, pool=None, session=None): - self.feed = feed - super().__init__(auth, pool, session) - - def clean_feed(self): - """Will reset the errors counters on a feed that have known errors""" - if self.feed.get('error_count') or self.feed.get('last_error'): - self.query_pyagg('put', 'feed/%d' % self.feed['id'], - {'error_count': 0, 'last_error': ''}) - - def callback(self, response): - 
"""will fetch the feed and interprete results (304, etag) or will - challenge pyagg to compare gotten entries with existing ones""" - try: - response = response.result() - response.raise_for_status() - except Exception as error: - error_count = self.feed['error_count'] + 1 - logger.exception('%r %r - an error occured while fetching ' - 'feed; bumping error count to %r', - self.feed['id'], self.feed['title'], error_count) - future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], - {'error_count': error_count, - 'last_error': str(error), - 'user_id': self.feed['user_id']}) - return - - if response.status_code == 304: - logger.info("%r %r - feed responded with 304", - self.feed['id'], self.feed['title']) - self.clean_feed() - return - if 'etag' not in response.headers: - logger.debug('%r %r - manually generating etag', - self.feed['id'], self.feed['title']) - response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text) - if response.headers['etag'] and self.feed['etag'] \ - and response.headers['etag'] == self.feed['etag']: - if 'pyagg' in self.feed['etag']: - logger.info("%r %r - calculated hash matches (%d)", - self.feed['id'], self.feed['title'], - response.status_code) - else: - logger.info("%r %r - feed responded with same etag (%d)", - self.feed['id'], self.feed['title'], - response.status_code) - self.clean_feed() - return - else: - logger.debug('%r %r - etag mismatch %r != %r', - self.feed['id'], self.feed['title'], - response.headers['etag'], self.feed['etag']) - logger.info('%r %r - cache validation failed, challenging entries', - self.feed['id'], self.feed['title']) - - ids, entries = [], {} - parsed_response = feedparser.parse(response.content) - for entry in parsed_response['entries']: - entry_ids = extract_id(entry) - entry_ids['feed_id'] = self.feed['id'] - entry_ids['user_id'] = self.feed['user_id'] - entries[tuple(sorted(entry_ids.items()))] = entry - ids.append(entry_ids) - logger.debug('%r %r - found %d entries %r', - self.feed['id'], 
self.feed['title'], len(ids), ids) - future = self.query_pyagg('get', 'articles/challenge', {'ids': ids}) - updater = PyAggUpdater(self.feed, entries, response.headers, - parsed_response, - self.auth, self.pool, self.session) - future.add_done_callback(updater.callback) - - -class CrawlerScheduler(AbstractCrawler): - - def __init__(self, username, password, pool=None, session=None): - self.auth = (username, password) - super(CrawlerScheduler, self).__init__(self.auth, pool, session) - - def prepare_headers(self, feed): - """For a known feed, will construct some header dictionnary""" - headers = {'User-Agent': conf.USER_AGENT} - if feed.get('last_modified'): - headers['If-Modified-Since'] = feed['last_modified'] - if feed.get('etag') and 'pyagg' not in feed['etag']: - headers['If-None-Match'] = feed['etag'] - logger.debug('%r %r - calculated headers %r', - feed['id'], feed['title'], headers) - return headers - - def callback(self, response): - """processes feeds that need to be fetched""" - response = response.result() - response.raise_for_status() - if response.status_code == 204: - logger.debug("No feed to fetch") - return - feeds = response.json() - logger.debug('%d to fetch %r', len(feeds), feeds) - for feed in feeds: - logger.debug('%r %r - fetching resources', - feed['id'], feed['title']) - future = self.session.get(feed['link'], - headers=self.prepare_headers(feed)) - - feed_crwlr = FeedCrawler(feed, self.auth, self.pool, self.session) - future.add_done_callback(feed_crwlr.callback) - - def run(self, **kwargs): - """entry point, will retreive feeds to be fetch - and launch the whole thing""" - logger.debug('retreving fetchable feed') - future = self.query_pyagg('get', 'feeds/fetchable', kwargs) - future.add_done_callback(self.callback) diff --git a/src/web/utils.py b/src/web/utils.py index fcd791e8..1d4b30ab 100755 --- a/src/web/utils.py +++ b/src/web/utils.py @@ -109,8 +109,8 @@ def fetch(id, feed_id=None): Fetch the feeds in a new processus. 
The "asyncio" crawler is launched with the manager. """ - cmd = [sys.executable, conf.BASE_DIR+'/manager.py', 'fetch_asyncio', str(id), - str(feed_id)] + cmd = [sys.executable, conf.BASE_DIR + '/manager.py', 'fetch_asyncio', + str(id), str(feed_id)] return subprocess.Popen(cmd, stdout=subprocess.PIPE) def history(user_id, year=None, month=None): -- cgit