diff options
-rw-r--r-- | fuzzycat/cluster.py | 13 | ||||
-rw-r--r-- | fuzzycat/main.py | 6 | ||||
-rw-r--r-- | fuzzycat/sandcrawler-title-denylist.txt | 559 |
3 files changed, 574 insertions, 4 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index e4a36bf..dde0688 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -246,6 +246,7 @@ class Cluster: output=sys.stdout, keyfunc=lambda v: v, prefix='fuzzycat-', + key_denylist=None, tmpdir=None): """ Files can be a list of files or "-" for stdin. @@ -256,6 +257,7 @@ class Cluster: self.prefix = prefix self.tmpdir = tmpdir self.logger = logging.getLogger('fuzzycat.cluster') + self.key_denylist = key_denylist def run(self): """ @@ -266,12 +268,15 @@ class Cluster: with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf: for line in fileinput.input(files=self.files): try: - id, key = keyfunc(json.loads(line)) - print("{}\t{}".format(id, key), file=tf) + ident, key = keyfunc(json.loads(line)) except (KeyError, ValueError): counter["key_extraction_failed"] += 1 - else: - counter["key_ok"] += 1 + continue + if self.key_denylist and key in self.key_denylist: + counter["key_denylist"] += 1 + continue + counter["key_ok"] += 1 + print("{}\t{}".format(ident, key), file=tf) sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir) with open(sbc) as f: comment = keyfunc.__name__ diff --git a/fuzzycat/main.py b/fuzzycat/main.py index c7ba23d..bfce68e 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -33,9 +33,14 @@ def run_cluster(args): 'tnysi': release_key_title_nysiis, 'tss': release_key_title_ngram, } + key_denylist = None + if args.key_denylist: + with open(args.key_denylist, 'r') as f: + key_denylist = [l.strip() for l in f.readlines()] cluster = Cluster(files=args.files, keyfunc=types.get(args.type), tmpdir=args.tmpdir, + key_denylist=key_denylist, prefix=args.prefix) stats = cluster.run() logger.debug(json.dumps(dict(stats))) @@ -83,6 +88,7 @@ if __name__ == '__main__': sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser]) sub_cluster.set_defaults(func=run_cluster) sub_cluster.add_argument('-f', '--files', default="-", help='input files') + sub_cluster.add_argument('--key-denylist', help='file path to key denylist') sub_cluster.add_argument('-t', '--type', default='title', diff --git a/fuzzycat/sandcrawler-title-denylist.txt b/fuzzycat/sandcrawler-title-denylist.txt new file mode 100644 index 0000000..ef575b4 --- /dev/null +++ b/fuzzycat/sandcrawler-title-denylist.txt @@ -0,0 +1,559 @@ +abbreviations +abbreviationsandacronyms +aboutauthors +abouttheauthor +abouttheauthors +aboutthecover +abouttheeditors +abreviations +abstract +abstractnotsubmittedforonlinepublication +abstractoriginalarticle +abstracts +abstractsofaapaposterandpodiumpresentations +abstractsofcommunications +abstractsofthesesfromthescandinaviancountries +abstractwithdrawn +acknowledgement +acknowledgements +acknowledgementsvii +acknowledgementtoreferees +acknowledgementtoreviewers +acknowledgment +acknowledgmentofreferees +acknowledgments +addendum +additionalresources +address +advertisement +advertisersindex +affect +affiliation +afterword +agenda +agradecimentos +agradecimientos +aimsandscope +analysis +annexa +announcement +announcements +annualacknowledgementofmanuscriptreviewers +anotefromtheeditor +appendices +appendix +appendix1 +appendixa +appendixb +appointmentsandstaffchanges +approximation +apresentacao +article +articlenumber +articles +articlesofsignificantinterestselectedfromthisissuebytheeditors +associationnews +ataglance +atribute +attention +authorguidelines +authorindex +authorindexforvolume81 +authorreply +authors +authorsreply +authorsresponse +avantpropos +award +awardsappointmentsannouncements +backcover +background +backmatter +berichtigung +besprechungen +bibliografia +bibliographie +bibliography +bigdata +blankpage +blood +boardoftrustees +booknotes +booknotices +bookofabstracts +bookreview +bookreviews +bookreviewsandnotices +bookreviewssection +booksreceived +buchbesprechung +buchbesprechungen +bulletin +calendar +calendarofevents +calendarofmeetings +callforarticles +callforpapers +casereport +casereports +casestudy +chairmansopeningremarks +changes +chaos +chapter1 +chapter10 +chapter1introduction +chapter2 +chapter7 +chapteri +chapterone +chapteroneintroduction +chaptertwo +chapterx +citation +classes +classified +classifieds +closingremarks +collaborateurs +comment +commentaries +commentary +commentaryon +commenton +comments +commentto +committee +communication +communications +communicationstotheeditor +communiquedepresse +community +components +comptesrendus +computerscience +concludingremarks +conclusion +conclusions +conferencereport +congratulations +congresscalendar +conservation +content +contents +context +continuingeducation +continuingmedicaleducation +contributors +copyright +copyrightform +copyrightnotice +correction +corrections +correspondence +corrigenda +corrigendum +councilminutes +cover +coverimage +currentresearch +curriculumvitae +danksagung +dearreaders +decisionmaking +dedication +dedicatoria +definition +description +discussion +diskussion +distribution +documents +ear +economics +editorial +editorialadvisoryboard +editorialannouncement +editorialboard +editorialcomment +editorialcomments +editorialconsultants +editoriale +editorialeditorial +editorialforeword +editorialinformation +editorialintroduction +editorialintroductions +editorialnote +editorialnotes +editorialpreface +editorials +editorialsoftwaresurveysection +editorialstaff +editorialstatement +editorinchief +editors +editorschoice +editorscomment +editorscomments +editorscorner +editorscorrespondence +editorsforeword +editorsintroduction +editorsletter +editorsnote +editorsnotes +editorspage +editorspicks +editorspreface +education +einfuhrung +einleitung +electrophoresis +employment +endnotes +entrevista +entscheidungsverzeichnis +epilogue +equipment +errata +erratum +essay +essays +executivesummary +exercises +expediente +extendedabstracts +feature +features +fichatecnica +figure3 +finalexam +finalreport +focus +foreward +foreword +forthcomingarticles +forthcomingevents +fortherecord +forum +frequentlyaskedquestions +fromtheeditor +fromtheeditorinchief +fromtheeditors +fromtheeditorsdesk +fromthepresident +frontmatter +furtherreadings +genealogy +generaldiscussion +generalinformation +generalintroduction +germany +gettingstarted +glosario +glossary +glossaryofterms +guesteditorial +guesteditorsforeword +guesteditorsintroduction +guideforauthors +guidelinesforcontributors +health +heartfailure +highlights +highlightsfromthisissue +highlightsofthisissue +history +home +homework +hypothesis +iii +imageofthemonth +impactfactor +importantnotice +impressum +inbrief +index +indexofauthors +indexofauthorsandtitles +indice +indicegeneral +informationforauthors +informationtoauthors +inhalt +inhaltsverzeichnis +inleiding +inmemoriam +inreply +inresponse +insidethisissue +institutenews +instructionsforauthors +instructionstoauthors +interview +inthestudy +inthisissue +introducao +introduccion +introduction +introductionandoverview +introductiongenerale +introductiontotheissue +introductiontothespecialissue +introductorycomments +introductoryremarks +introduzione +inventions +invitedcommentary +issuesandevents +jobdescription +journalclub +journalscan +keywords +kurzkommentiert +languageteaching +lecture +letter +letterfromtheeditor +letterfromtheeditorinchief +letterfromtheeditors +letterfromthepresident +letters +letterstotheeditor +letterstotheeditors +lettertotheeditor +lettertotheeditors +liminaire +linearalgebra +linearregression +links +listedestableaux +listofabbreviations +listofcontributors +listoffigures +listofparticipants +listofpublications +listofreferees +listofreviewers +listoftables +literacy +literatur +literature +literaturecited +literaturereview +literaturrundschau +literaturverzeichnis +litteraturverzeichniss +livresrecus +lucina +lungcancer +magazin +maintenance +materials +materialsafetydatasheet +materialsandmethods +medicinalchemistry +meetingabstracts +meetingreport +meetings +meetingsandconferences +meetingsofinterest +membershipapplication +memoranda +memorandum +messagefromgeneralcochairs +messagefromthechairs +messagefromtheeditor +messagefromtheeditorinchief +messagefromthepresident +messagefromtheprogramchairs +messagefromtheprogramcochairs +metaanalysis +miscellanea +miscellaneous +miscellany +missionstatement +motivation +mrsnews +name +newbooks +newlyelectedmembersofthecollege +newproducts +news +newsandnotes +newsandreviews +newsandviews +newsbriefs +newsinbrief +newsnotes +newsviews +noii +note +notefromtheeditor +notes +notesandcomments +notesandnews +notesdelecture +notesforcontributors +notesoncontributors +notice +noticeboard +notitle +notitleavailable +nr +obituaries +obituary +online +openaccess +openingaddress +openingremarks +oralabstracts +oralpresentations +organizingcommittee +originalarticle +originalarticles +other +outline +overview +panorama +papers +paperstoappearinforthcomingissues +partone +personalandmiscellaneous +perspective +perspectives +philosophy +pictureofthemonth +place +pointofview +positionsavailable +poster +posterpresentations +postscript +preface +prefaceandacknowledgements +prefacetothesecondedition +preliminarymaterial +presentacio +presentacion +presentation +presidentialaddress +presidentsmessage +presidentsreport +pressrelease +print +printing +proceedings +proceedingsofthenationalacademyofsciences +profile +programcommittee +projectmanagement +prologue +publication +publichealth +publishersnote +question +questionsandanswers +radiology +readersforum +recensiones +recensions +recentpublications +redaktorensforord +referate +references +referenciasbibliograficas +regression +rehabilitation +rejoinder +remerciements +reply +replybyauthors +researchresearchers +resenas +resources +response +responsetothelettertotheeditor +results +resume +resumen +resumes +resumo +retraction +review +reviewarticle +revieweracknowledgement +revieweracknowledgement2013 +reviewers +reviewessay +reviews +reviewsanddescriptionsoftablesandbooks +reviewsofbooks +rezension +rezensionen +safety +section +security +selectedbibliography +shortcommunication +shorternotices +shortnotices +socialengineering +sociology +sommaire +sommario +specialreport +specialsection +specifications +spistresci +subjectindex +subscriptions +suggestedreadings +sumario +summaries +summariesofkeyjournalarticles +summary +summaryofproceedings +summer +sun +supplementarymaterial +symposium +symptom +synthese +tabledesmatieres +tableofcontents +tableofcontentsandprologue +technicalreport +theauthors +theauthorsreply +thebasics +theeditorsdesk +thefirstauthorreplies +thelancet +theoreticalbackground +thetimes +theworldbank +theyearinreview +thismonthin +thismonthinthejournal +timemanagement +titeleiinhaltsverzeichnis +title +titlepage +titlepagei +tocorrespondents +totheeditor +unitedkingdom +unitednations +unitedstates +upcomingevents +vorwort +website +welcome +whatshappening +whatsnew +workscited +yourquestionsanswered +zudiesemheft +zusammenfassung |