diff options
author | Cédric Bonhomme <kimble.mandel@gmail.com> | 2012-11-28 11:39:23 +0100 |
---|---|---|
committer | Cédric Bonhomme <kimble.mandel@gmail.com> | 2012-11-28 11:39:23 +0100 |
commit | 84a79ec06541c7db92af48b43d1d4d379cded730 (patch) | |
tree | acbaa6aa38153717d6cf360e519325e56f054492 /source | |
parent | Fix: number of feeds wan no longer displayed in the navigation bar. (diff) | |
download | newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.tar.gz newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.tar.bz2 newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.zip |
Ignore stop words when calculating top words.
Diffstat (limited to 'source')
-rwxr-xr-x | source/utils.py | 17 | ||||
-rw-r--r-- | source/var/english-stop-words.txt | 311 | ||||
-rw-r--r-- | source/var/french-stop-words.txt | 176 | ||||
-rwxr-xr-x | source/var/generate-top-words-list.sh | 8 | ||||
-rw-r--r-- | source/var/stop_words/english-stop-words-list.txt | 1 | ||||
-rw-r--r-- | source/var/stop_words/french-stop-words-list.txt | 1 |
6 files changed, 513 insertions, 1 deletions
diff --git a/source/utils.py b/source/utils.py index 7681fea7..475f3c06 100755 --- a/source/utils.py +++ b/source/utils.py @@ -36,6 +36,7 @@ __license__ = "GPLv3" import os import re +import glob import operator import urllib.parse import calendar @@ -139,15 +140,29 @@ def normalize_filename(name): file_name = strip_accents(file_name, "utf-8") return os.path.normpath(file_name) +def load_stop_words(): + """ + Load the stop words and return them in a list. + """ + stop_words_lists = glob.glob('./var/stop_words/*.txt') + stop_words = [] + + for stop_wods_list in stop_words_lists: + with open(stop_wods_list, "r") as stop_wods_file: + stop_words += stop_wods_file.read().split(";") + return stop_words + def top_words(articles, n=10, size=5): """ Return the n most frequent words in a list. """ + stop_words = load_stop_words() words = Counter() wordre = re.compile(r'\b\w{%s,}\b' % size, re.I) for article in articles: for word in wordre.findall(clear_string(article["article_content"])): - words[word.lower()] += 1 + if word.lower() not in stop_words: + words[word.lower()] += 1 return words.most_common(n) def tag_cloud(tags, query="word_count"): diff --git a/source/var/english-stop-words.txt b/source/var/english-stop-words.txt new file mode 100644 index 00000000..497a1f96 --- /dev/null +++ b/source/var/english-stop-words.txt @@ -0,0 +1,311 @@ + + | An English stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | Many of the forms below are quite rare (e.g. "yourselves") but included for + | completeness. + + | PRONOUNS FORMS + | 1st person sing + +i | subject, always in upper case of course + +me | object +my | possessive adjective + | the possessive pronoun `mine' is best suppressed, because of the + | sense of coal-mine etc. +myself | reflexive + | 1st person plural +we | subject + +| us | object + | care is required here because US = United States. It is usually + | safe to remove it if it is in lower case. +our | possessive adjective +ours | possessive pronoun +ourselves | reflexive + | second person (archaic `thou' forms not included) +you | subject and object +your | possessive adjective +yours | possessive pronoun +yourself | reflexive (singular) +yourselves | reflexive (plural) + | third person singular +he | subject +him | object +his | possessive adjective and pronoun +himself | reflexive + +she | subject +her | object and possessive adjective +hers | possessive pronoun +herself | reflexive + +it | subject and object +its | possessive adjective +itself | reflexive + | third person plural +they | subject +them | object +their | possessive adjective +theirs | possessive pronoun +themselves | reflexive + | other forms (demonstratives, interrogatives) +what +which +who +whom +this +that +these +those + + | VERB FORMS (using F.R. Palmer's nomenclature) + | BE +am | 1st person, present +is | -s form (3rd person, present) +are | present +was | 1st person, past +were | past +be | infinitive +been | past participle +being | -ing form + | HAVE +have | simple +has | -s form +had | past +having | -ing form + | DO +do | simple +does | -s form +did | past +doing | -ing form + + | The forms below are, I believe, best omitted, because of the significant + | homonym forms: + + | He made a WILL + | old tin CAN + | merry month of MAY + | a smell of MUST + | fight the good fight with all thy MIGHT + + | would, could, should, ought might however be included + + | | AUXILIARIES + | | WILL + |will + +would + + | | SHALL + |shall + +should + + | | CAN + |can + +could + + | | MAY + |may + |might + | | MUST + |must + | | OUGHT + +ought + + | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing + | pronoun + verb + +i'm +you're +he's +she's +it's +we're +they're +i've +you've +we've +they've +i'd +you'd +he'd +she'd +we'd +they'd +i'll +you'll +he'll +she'll +we'll +they'll + + | verb + negation + +isn't +aren't +wasn't +weren't +hasn't +haven't +hadn't +doesn't +don't +didn't + + | auxiliary + negation + +won't +wouldn't +shan't +shouldn't +can't +cannot +couldn't +mustn't + + | miscellaneous forms + +let's +that's +who's +what's +here's +there's +when's +where's +why's +how's + + | rarer forms + + | daren't needn't + + | doubtful forms + + | oughtn't mightn't + + | ARTICLES +a +an +the + + | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so + | high, that classification is pointless.) +and +but +if +or +because +as +until +while + +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under + +again +further +then +once + +here +there +when +where +why +how + +all +any +both +each +few +more +most +other +some +such + +no +nor +not +only +own +same +so +than +too +very + + | Just for the record, the following words are among the commonest in English + + | one + | every + | least + | less + | many + | now + | ever + | never + | say + | says + | said + | also + | get + | go + | goes + | just + | made + | make + | put + | see + | seen + | whether + | like + | well + | back + | even + | still + | way + | take + | since + | another + | however + | two + | three + | four + | five + | first + | second + | new + | old + | high + | long
\ No newline at end of file diff --git a/source/var/french-stop-words.txt b/source/var/french-stop-words.txt new file mode 100644 index 00000000..08a2f5d7 --- /dev/null +++ b/source/var/french-stop-words.txt @@ -0,0 +1,176 @@ + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) +son | his, her (masc) +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): +été +étée +étées +étés +étant +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +celà | that +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself
\ No newline at end of file diff --git a/source/var/generate-top-words-list.sh b/source/var/generate-top-words-list.sh new file mode 100755 index 00000000..2a87e147 --- /dev/null +++ b/source/var/generate-top-words-list.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +if test $# != 2 ; then + echo No input files given 1>&2 + exit 1 +fi + +awk 'BEGIN{FS = " "} { if ($1 ~ /^[A-Za-z]/) {print $1}}' $1 | sort | tr '\n' ';' > $2
\ No newline at end of file diff --git a/source/var/stop_words/english-stop-words-list.txt b/source/var/stop_words/english-stop-words-list.txt new file mode 100644 index 00000000..caa26aaf --- /dev/null +++ b/source/var/stop_words/english-stop-words-list.txt @@ -0,0 +1 @@ +a;about;above;after;again;against;all;am;an;and;any;are;aren't;as;at;be;because;been;before;being;below;between;both;but;by;cannot;can't;could;couldn't;did;didn't;do;does;doesn't;doing;don't;down;during;each;few;for;from;further;had;hadn't;has;hasn't;have;haven't;having;he;he'd;he'll;her;here;here's;hers;herself;he's;him;himself;his;how;how's;i;i'd;if;i'll;i'm;in;into;is;isn't;it;its;it's;itself;i've;let's;me;more;most;mustn't;my;myself;no;nor;not;of;off;on;once;only;or;other;ought;our;ours;ourselves;out;over;own;same;shan't;she;she'd;she'll;she's;should;shouldn't;slashdot;so;some;such;than;that;that's;the;their;theirs;them;themselves;then;there;there's;these;they;they'd;they'll;they're;they've;this;those;through;to;too;under;until;up;very;was;wasn't;we;we'd;we'll;were;we're;weren't;we've;what;what's;when;when's;where;where's;which;while;who;whom;who's;why;why's;with;won't;would;wouldn't;writes;you;you'd;you'll;your;you're;yours;yourself;yourselves;you've; diff --git a/source/var/stop_words/french-stop-words-list.txt b/source/var/stop_words/french-stop-words-list.txt new file mode 100644 index 00000000..a6a36c79 --- /dev/null +++ b/source/var/stop_words/french-stop-words-list.txt @@ -0,0 +1 @@ +à;ai;aie;aient;aies;ait;as;au;aura;aurai;auraient;aurais;aurait;auras;aurez;auriez;aurions;aurons;auront;aux;avaient;avais;avait;avec;avez;aviez;avions;avons;ayant;ayez;ayons;c;ce;ceci;celà;ces;cet;cette;d;dans;de;des;du;elle;en;es;est;et;étaient;étais;était;étant;été;étée;étées;êtes;étés;étiez;étions;eu;eue;eues;eûmes;eurent;eus;eusse;eussent;eusses;eussiez;eussions;eut;eût;eûtes;eux;fûmes;furent;fus;fusse;fussent;fusses;fussiez;fussions;fut;fût;fûtes;ici;il;ils;j;je;l;la;le;les;leur;leurs;lui;m;ma;mais;me;même;mes;moi;mon;n;ne;nos;notre;nous;on;ont;ou;par;pas;pour;qu;que;quel;quelle;quelles;quels;qui;s;sa;sans;se;sera;serai;seraient;serais;serait;seras;serez;seriez;serions;serons;seront;ses;soi;soient;sois;soit;sommes;son;sont;soyez;soyons;suis;sur;t;ta;te;tes;toi;ton;tu;toujours;un;une;vos;votre;vous;y; |