From 9d21a922386624b39b730849ab56c00a061f255f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 5 Sep 2018 15:04:14 -0700 Subject: blacklist -> denylist --- python/extraction_cdx_grobid.py | 8 +- python/extraction_ungrobided.py | 8 +- scalding/src/main/resources/slug-blacklist.txt | 554 --------------------- scalding/src/main/resources/slug-denylist.txt | 554 +++++++++++++++++++++ .../main/scala/sandcrawler/ScorableFeatures.scala | 6 +- .../scala/sandcrawler/ScorableFeaturesTest.scala | 2 +- 6 files changed, 566 insertions(+), 566 deletions(-) delete mode 100644 scalding/src/main/resources/slug-blacklist.txt create mode 100644 scalding/src/main/resources/slug-denylist.txt diff --git a/python/extraction_cdx_grobid.py b/python/extraction_cdx_grobid.py index 040538c..b27d053 100755 --- a/python/extraction_cdx_grobid.py +++ b/python/extraction_cdx_grobid.py @@ -37,7 +37,7 @@ from grobid2json import teixml2json sentry_client = raven.Client() # Specific poison-pill rows we should skip -KEY_BLACKLIST = ( +KEY_DENYLIST = ( 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format" ) @@ -210,9 +210,9 @@ class MRExtractCdxGrobid(MRJob): yield _, status return key = info['key'] - if key in KEY_BLACKLIST: - self.increment_counter('lines', 'blacklist') - yield _, dict(status='blacklist', key=key) + if key in KEY_DENYLIST: + self.increment_counter('lines', 'denylist') + yield _, dict(status='denylist', key=key) return # Note: this may not get "cleared" correctly diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py index 4074112..972b8f9 100755 --- a/python/extraction_ungrobided.py +++ b/python/extraction_ungrobided.py @@ -37,7 +37,7 @@ from grobid2json import teixml2json sentry_client = raven.Client() # Specific poison-pill rows we should skip -KEY_BLACKLIST = ( +KEY_DENYLIST = ( 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format" ) @@ -212,9 +212,9 @@ class MRExtractUnGrobided(MRJob): yield _, status return key = info['key'] - if key in KEY_BLACKLIST: - self.increment_counter('lines', 'blacklist') - yield _, dict(status='blacklist', key=key) + if key in KEY_DENYLIST: + self.increment_counter('lines', 'denylist') + yield _, dict(status='denylist', key=key) return # Note: this may not get "cleared" correctly diff --git a/scalding/src/main/resources/slug-blacklist.txt b/scalding/src/main/resources/slug-blacklist.txt deleted file mode 100644 index 926dbd5..0000000 --- a/scalding/src/main/resources/slug-blacklist.txt +++ /dev/null @@ -1,554 +0,0 @@ -abbreviations -abbreviationsandacronyms -aboutauthors -abouttheauthor -abouttheauthors -aboutthecover -abouttheeditors -abreviations -abstract -abstractnotsubmittedforonlinepublication -abstracts -abstractsofaapaposterandpodiumpresentations -abstractsofcommunications -abstractsofthesesfromthescandinaviancountries -abstractwithdrawn -acknowledgement -acknowledgements -acknowledgementsvii -acknowledgementtoreferees -acknowledgementtoreviewers -acknowledgment -acknowledgmentofreferees -acknowledgments -addendum -additionalresources -address -advertisersindex -affect -affiliation -afterword -agenda -agradecimentos -agradecimientos -aimsandscope -analysis -annexa -announcement -announcements -annualacknowledgementofmanuscriptreviewers -anotefromtheeditor -appendices -appendix -appendix1 -appendixa -appendixb -appointmentsandstaffchanges -approximation -apresentacao -article -articles -articlesofsignificantinterestselectedfromthisissuebytheeditors -associationnews -ataglance -atribute -attention -authorguidelines -authorindex -authorindexforvolume81 -authorreply -authors -authorsreply -authorsresponse -avantpropos -award -awardsappointmentsannouncements -backcover -background -backmatter -berichtigung -besprechungen -bibliografia -bibliographie -bibliography -bigdata -blankpage -blood -boardoftrustees -booknotes -booknotices -bookofabstracts -bookreview -bookreviews -bookreviewsandnotices -bookreviewssection -booksreceived -buchbesprechung -buchbesprechungen -bulletin -calendar -calendarofevents -calendarofmeetings -callforarticles -callforpapers -casereport -casereports -casestudy -chairmansopeningremarks -changes -chaos -chapter1 -chapter10 -chapter1introduction -chapter2 -chapter7 -chapteri -chapterone -chapteroneintroduction -chaptertwo -chapterx -citation -classes -classified -classifieds -closingremarks -collaborateurs -comment -commentaries -commentary -commentaryon -commenton -comments -commentto -committee -communication -communications -communicationstotheeditor -communiquedepresse -community -components -comptesrendus -computerscience -concludingremarks -conclusion -conclusions -conferencereport -congratulations -congresscalendar -conservation -content -contents -context -continuingeducation -continuingmedicaleducation -contributors -copyright -copyrightform -copyrightnotice -correction -corrections -correspondence -corrigenda -corrigendum -councilminutes -cover -coverimage -currentresearch -curriculumvitae -danksagung -dearreaders -decisionmaking -dedication -dedicatoria -definition -description -discussion -diskussion -distribution -documents -ear -economics -editorial -editorialadvisoryboard -editorialannouncement -editorialboard -editorialcomment -editorialcomments -editorialconsultants -editoriale -editorialeditorial -editorialforeword -editorialinformation -editorialintroduction -editorialintroductions -editorialnote -editorialnotes -editorialpreface -editorials -editorialsoftwaresurveysection -editorialstaff -editorialstatement -editorinchief -editors -editorschoice -editorscomment -editorscomments -editorscorner -editorscorrespondence -editorsforeword -editorsintroduction -editorsletter -editorsnote -editorsnotes -editorspage -editorspicks -editorspreface -education -einfuhrung -einleitung -electrophoresis -employment -endnotes -entrevista -entscheidungsverzeichnis -epilogue -equipment -errata -erratum -essay -essays -executivesummary -exercises -expediente -extendedabstracts -feature -features -fichatecnica -figure3 -finalexam -finalreport -focus -foreward -foreword -forthcomingarticles -forthcomingevents -fortherecord -forum -frequentlyaskedquestions -fromtheeditor -fromtheeditorinchief -fromtheeditors -fromtheeditorsdesk -fromthepresident -frontmatter -furtherreadings -genealogy -generaldiscussion -generalinformation -generalintroduction -germany -gettingstarted -glosario -glossary -glossaryofterms -guesteditorial -guesteditorsforeword -guesteditorsintroduction -guideforauthors -guidelinesforcontributors -health -heartfailure -highlights -highlightsfromthisissue -highlightsofthisissue -history -home -homework -hypothesis -iii -imageofthemonth -importantnotice -impressum -inbrief -index -indexofauthors -indexofauthorsandtitles -indice -indicegeneral -informationforauthors -informationtoauthors -inhalt -inhaltsverzeichnis -inleiding -inmemoriam -inreply -inresponse -insidethisissue -institutenews -instructionsforauthors -instructionstoauthors -interview -inthestudy -inthisissue -introducao -introduccion -introduction -introductionandoverview -introductiongenerale -introductiontotheissue -introductiontothespecialissue -introductorycomments -introductoryremarks -introduzione -inventions -invitedcommentary -issuesandevents -jobdescription -journalclub -journalscan -keywords -kurzkommentiert -languageteaching -lecture -letter -letterfromtheeditor -letterfromtheeditorinchief -letterfromtheeditors -letterfromthepresident -letters -letterstotheeditor -letterstotheeditors -lettertotheeditor -lettertotheeditors -liminaire -linearalgebra -linearregression -links -listedestableaux -listofabbreviations -listofcontributors -listoffigures -listofparticipants -listofpublications -listofreferees -listofreviewers -listoftables -literacy -literatur -literature -literaturecited -literaturereview -literaturrundschau -literaturverzeichnis -litteraturverzeichniss -livresrecus -lucina -lungcancer -magazin -maintenance -materials -materialsafetydatasheet -materialsandmethods -medicinalchemistry -meetingabstracts -meetingreport -meetings -meetingsandconferences -meetingsofinterest -membershipapplication -memoranda -memorandum -messagefromgeneralcochairs -messagefromthechairs -messagefromtheeditor -messagefromtheeditorinchief -messagefromthepresident -messagefromtheprogramchairs -messagefromtheprogramcochairs -metaanalysis -miscellanea -miscellaneous -miscellany -missionstatement -motivation -mrsnews -name -newbooks -newlyelectedmembersofthecollege -newproducts -news -newsandnotes -newsandreviews -newsandviews -newsbriefs -newsinbrief -newsnotes -newsviews -noii -note -notefromtheeditor -notes -notesandcomments -notesandnews -notesdelecture -notesforcontributors -notesoncontributors -notice -noticeboard -notitle -notitleavailable -obituaries -obituary -online -openaccess -openingaddress -openingremarks -oralabstracts -oralpresentations -organizingcommittee -originalarticle -originalarticles -other -outline -overview -panorama -papers -paperstoappearinforthcomingissues -partone -personalandmiscellaneous -perspective -perspectives -philosophy -pictureofthemonth -place -pointofview -positionsavailable -poster -posterpresentations -postscript -preface -prefaceandacknowledgements -prefacetothesecondedition -preliminarymaterial -presentacio -presentacion -presentation -presidentialaddress -presidentsmessage -presidentsreport -pressrelease -print -printing -proceedings -proceedingsofthenationalacademyofsciences -profile -programcommittee -projectmanagement -prologue -publication -publichealth -publishersnote -question -questionsandanswers -radiology -readersforum -recensiones -recensions -recentpublications -redaktorensforord -referate -references -referenciasbibliograficas -regression -rehabilitation -rejoinder -remerciements -reply -replybyauthors -researchresearchers -resenas -resources -response -responsetothelettertotheeditor -results -resume -resumen -resumes -resumo -retraction -review -reviewarticle -revieweracknowledgement -revieweracknowledgement2013 -reviewers -reviewessay -reviews -reviewsanddescriptionsoftablesandbooks -reviewsofbooks -rezension -rezensionen -safety -section -security -selectedbibliography -shortcommunication -shorternotices -shortnotices -socialengineering -sociology -sommaire -sommario -specialreport -specialsection -specifications -spistresci -subjectindex -subscriptions -suggestedreadings -sumario -summaries -summariesofkeyjournalarticles -summary -summaryofproceedings -summer -sun -supplementarymaterial -symposium -symptom -synthese -tabledesmatieres -tableofcontents -tableofcontentsandprologue -technicalreport -theauthors -theauthorsreply -thebasics -theeditorsdesk -thefirstauthorreplies -thelancet -theoreticalbackground -thetimes -theworldbank -theyearinreview -thismonthin -thismonthinthejournal -timemanagement -titeleiinhaltsverzeichnis -title -titlepage -titlepagei -tocorrespondents -totheeditor -unitedkingdom -unitednations -unitedstates -upcomingevents -vorwort -website -welcome -whatshappening -whatsnew -workscited -yourquestionsanswered -zudiesemheft -zusammenfassung diff --git a/scalding/src/main/resources/slug-denylist.txt b/scalding/src/main/resources/slug-denylist.txt new file mode 100644 index 0000000..926dbd5 --- /dev/null +++ b/scalding/src/main/resources/slug-denylist.txt @@ -0,0 +1,554 @@ +abbreviations +abbreviationsandacronyms +aboutauthors +abouttheauthor +abouttheauthors +aboutthecover +abouttheeditors +abreviations +abstract +abstractnotsubmittedforonlinepublication +abstracts +abstractsofaapaposterandpodiumpresentations +abstractsofcommunications +abstractsofthesesfromthescandinaviancountries +abstractwithdrawn +acknowledgement +acknowledgements +acknowledgementsvii +acknowledgementtoreferees +acknowledgementtoreviewers +acknowledgment +acknowledgmentofreferees +acknowledgments +addendum +additionalresources +address +advertisersindex +affect +affiliation +afterword +agenda +agradecimentos +agradecimientos +aimsandscope +analysis +annexa +announcement +announcements +annualacknowledgementofmanuscriptreviewers +anotefromtheeditor +appendices +appendix +appendix1 +appendixa +appendixb +appointmentsandstaffchanges +approximation +apresentacao +article +articles +articlesofsignificantinterestselectedfromthisissuebytheeditors +associationnews +ataglance +atribute +attention +authorguidelines +authorindex +authorindexforvolume81 +authorreply +authors +authorsreply +authorsresponse +avantpropos +award +awardsappointmentsannouncements +backcover +background +backmatter +berichtigung +besprechungen +bibliografia +bibliographie +bibliography +bigdata +blankpage +blood +boardoftrustees +booknotes +booknotices +bookofabstracts +bookreview +bookreviews +bookreviewsandnotices +bookreviewssection +booksreceived +buchbesprechung +buchbesprechungen +bulletin +calendar +calendarofevents +calendarofmeetings +callforarticles +callforpapers +casereport +casereports +casestudy +chairmansopeningremarks +changes +chaos +chapter1 +chapter10 +chapter1introduction +chapter2 +chapter7 +chapteri +chapterone +chapteroneintroduction +chaptertwo +chapterx +citation +classes +classified +classifieds +closingremarks +collaborateurs +comment +commentaries +commentary +commentaryon +commenton +comments +commentto +committee +communication +communications +communicationstotheeditor +communiquedepresse +community +components +comptesrendus +computerscience +concludingremarks +conclusion +conclusions +conferencereport +congratulations +congresscalendar +conservation +content +contents +context +continuingeducation +continuingmedicaleducation +contributors +copyright +copyrightform +copyrightnotice +correction +corrections +correspondence +corrigenda +corrigendum +councilminutes +cover +coverimage +currentresearch +curriculumvitae +danksagung +dearreaders +decisionmaking +dedication +dedicatoria +definition +description +discussion +diskussion +distribution +documents +ear +economics +editorial +editorialadvisoryboard +editorialannouncement +editorialboard +editorialcomment +editorialcomments +editorialconsultants +editoriale +editorialeditorial +editorialforeword +editorialinformation +editorialintroduction +editorialintroductions +editorialnote +editorialnotes +editorialpreface +editorials +editorialsoftwaresurveysection +editorialstaff +editorialstatement +editorinchief +editors +editorschoice +editorscomment +editorscomments +editorscorner +editorscorrespondence +editorsforeword +editorsintroduction +editorsletter +editorsnote +editorsnotes +editorspage +editorspicks +editorspreface +education +einfuhrung +einleitung +electrophoresis +employment +endnotes +entrevista +entscheidungsverzeichnis +epilogue +equipment +errata +erratum +essay +essays +executivesummary +exercises +expediente +extendedabstracts +feature +features +fichatecnica +figure3 +finalexam +finalreport +focus +foreward +foreword +forthcomingarticles +forthcomingevents +fortherecord +forum +frequentlyaskedquestions +fromtheeditor +fromtheeditorinchief +fromtheeditors +fromtheeditorsdesk +fromthepresident +frontmatter +furtherreadings +genealogy +generaldiscussion +generalinformation +generalintroduction +germany +gettingstarted +glosario +glossary +glossaryofterms +guesteditorial +guesteditorsforeword +guesteditorsintroduction +guideforauthors +guidelinesforcontributors +health +heartfailure +highlights +highlightsfromthisissue +highlightsofthisissue +history +home +homework +hypothesis +iii +imageofthemonth +importantnotice +impressum +inbrief +index +indexofauthors +indexofauthorsandtitles +indice +indicegeneral +informationforauthors +informationtoauthors +inhalt +inhaltsverzeichnis +inleiding +inmemoriam +inreply +inresponse +insidethisissue +institutenews +instructionsforauthors +instructionstoauthors +interview +inthestudy +inthisissue +introducao +introduccion +introduction +introductionandoverview +introductiongenerale +introductiontotheissue +introductiontothespecialissue +introductorycomments +introductoryremarks +introduzione +inventions +invitedcommentary +issuesandevents +jobdescription +journalclub +journalscan +keywords +kurzkommentiert +languageteaching +lecture +letter +letterfromtheeditor +letterfromtheeditorinchief +letterfromtheeditors +letterfromthepresident +letters +letterstotheeditor +letterstotheeditors +lettertotheeditor +lettertotheeditors +liminaire +linearalgebra +linearregression +links +listedestableaux +listofabbreviations +listofcontributors +listoffigures +listofparticipants +listofpublications +listofreferees +listofreviewers +listoftables +literacy +literatur +literature +literaturecited +literaturereview +literaturrundschau +literaturverzeichnis +litteraturverzeichniss +livresrecus +lucina +lungcancer +magazin +maintenance +materials +materialsafetydatasheet +materialsandmethods +medicinalchemistry +meetingabstracts +meetingreport +meetings +meetingsandconferences +meetingsofinterest +membershipapplication +memoranda +memorandum +messagefromgeneralcochairs +messagefromthechairs +messagefromtheeditor +messagefromtheeditorinchief +messagefromthepresident +messagefromtheprogramchairs +messagefromtheprogramcochairs +metaanalysis +miscellanea +miscellaneous +miscellany +missionstatement +motivation +mrsnews +name +newbooks +newlyelectedmembersofthecollege +newproducts +news +newsandnotes +newsandreviews +newsandviews +newsbriefs +newsinbrief +newsnotes +newsviews +noii +note +notefromtheeditor +notes +notesandcomments +notesandnews +notesdelecture +notesforcontributors +notesoncontributors +notice +noticeboard +notitle +notitleavailable +obituaries +obituary +online +openaccess +openingaddress +openingremarks +oralabstracts +oralpresentations +organizingcommittee +originalarticle +originalarticles +other +outline +overview +panorama +papers +paperstoappearinforthcomingissues +partone +personalandmiscellaneous +perspective +perspectives +philosophy +pictureofthemonth +place +pointofview +positionsavailable +poster +posterpresentations +postscript +preface +prefaceandacknowledgements +prefacetothesecondedition +preliminarymaterial +presentacio +presentacion +presentation +presidentialaddress +presidentsmessage +presidentsreport +pressrelease +print +printing +proceedings +proceedingsofthenationalacademyofsciences +profile +programcommittee +projectmanagement +prologue +publication +publichealth +publishersnote +question +questionsandanswers +radiology +readersforum +recensiones +recensions +recentpublications +redaktorensforord +referate +references +referenciasbibliograficas +regression +rehabilitation +rejoinder +remerciements +reply +replybyauthors +researchresearchers +resenas +resources +response +responsetothelettertotheeditor +results +resume +resumen +resumes +resumo +retraction +review +reviewarticle +revieweracknowledgement +revieweracknowledgement2013 +reviewers +reviewessay +reviews +reviewsanddescriptionsoftablesandbooks +reviewsofbooks +rezension +rezensionen +safety +section +security +selectedbibliography +shortcommunication +shorternotices +shortnotices +socialengineering +sociology +sommaire +sommario +specialreport +specialsection +specifications +spistresci +subjectindex +subscriptions +suggestedreadings +sumario +summaries +summariesofkeyjournalarticles +summary +summaryofproceedings +summer +sun +supplementarymaterial +symposium +symptom +synthese +tabledesmatieres +tableofcontents +tableofcontentsandprologue +technicalreport +theauthors +theauthorsreply +thebasics +theeditorsdesk +thefirstauthorreplies +thelancet +theoreticalbackground +thetimes +theworldbank +theyearinreview +thismonthin +thismonthinthejournal +timemanagement +titeleiinhaltsverzeichnis +title +titlepage +titlepagei +tocorrespondents +totheeditor +unitedkingdom +unitednations +unitedstates +upcomingevents +vorwort +website +welcome +whatshappening +whatsnew +workscited +yourquestionsanswered +zudiesemheft +zusammenfassung diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index be2b495..95a39aa 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -8,8 +8,8 @@ import scala.util.parsing.json.JSONObject object ScorableFeatures { // TODO: Add exception handling. - val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") - val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet + val fileStream : InputStream = getClass.getResourceAsStream("/slug-denylist.txt") + val SlugDenylist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet fileStream.close val MinSlugLength = 8 @@ -44,7 +44,7 @@ class ScorableFeatures private(title : String, authors : List[Any] = List(), yea val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") if (slug.isEmpty || slug == null - || (ScorableFeatures.SlugBlacklist contains slug) + || (ScorableFeatures.SlugDenylist contains slug) || (slug.length < ScorableFeatures.MinSlugLength)) { None } else { diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 112a5e5..c847296 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -42,7 +42,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { it should "filter stub titles" in { titleToSlug("abstract") shouldBe (None) titleToSlug("title!") shouldBe (None) - titleToSlug("a real title which is not on blacklist") shouldBe Some("arealtitlewhichisnotonblacklist") + titleToSlug("a real title which is not on denylist") shouldBe Some("arealtitlewhichisnotondenylist") } it should "strip special characters" in { -- cgit v1.2.3