aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-11-10 19:27:17 -0800
committerBryan Newbold <bnewbold@archive.org>2020-11-10 19:27:19 -0800
commitba035d8641a5e94d93448bb9a0cd56c7756d7055 (patch)
tree5f8d82eeba6c4dd37095b8c67a70217ffc8eba97
parent84fd65b58e33f87b544e2875d87daa941587c511 (diff)
downloadfuzzycat-ba035d8641a5e94d93448bb9a0cd56c7756d7055.tar.gz
fuzzycat-ba035d8641a5e94d93448bb9a0cd56c7756d7055.zip
add support for key denylist
This filters out cluster rows whose resulting key appears in a given text file (one key per line). The intent is to exclude records with either poor metadata or very generic metadata from fuzzy matching. E.g., in many cases it is better not to try matching "Letter to the Editor" to any record at all. This won't always be the case: we might have journal, volume, issue, and page, which would allow a match. So the denylist is optional and can be specified on the command line.
-rw-r--r--fuzzycat/cluster.py13
-rw-r--r--fuzzycat/main.py6
-rw-r--r--fuzzycat/sandcrawler-title-denylist.txt559
3 files changed, 574 insertions, 4 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index e4a36bf..dde0688 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -246,6 +246,7 @@ class Cluster:
output=sys.stdout,
keyfunc=lambda v: v,
prefix='fuzzycat-',
+ key_denylist=None,
tmpdir=None):
"""
Files can be a list of files or "-" for stdin.
@@ -256,6 +257,7 @@ class Cluster:
self.prefix = prefix
self.tmpdir = tmpdir
self.logger = logging.getLogger('fuzzycat.cluster')
+ self.key_denylist = key_denylist
def run(self):
"""
@@ -266,12 +268,15 @@ class Cluster:
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
for line in fileinput.input(files=self.files):
try:
- id, key = keyfunc(json.loads(line))
- print("{}\t{}".format(id, key), file=tf)
+ ident, key = keyfunc(json.loads(line))
except (KeyError, ValueError):
counter["key_extraction_failed"] += 1
- else:
- counter["key_ok"] += 1
+ continue
+ if self.key_denylist and key in self.key_denylist:
+ counter["key_denylist"] += 1
+ continue
+ counter["key_ok"] += 1
+ print("{}\t{}".format(ident, key), file=tf)
sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir)
with open(sbc) as f:
comment = keyfunc.__name__
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index c7ba23d..bfce68e 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -33,9 +33,14 @@ def run_cluster(args):
'tnysi': release_key_title_nysiis,
'tss': release_key_title_ngram,
}
+ key_denylist = None
+ if args.key_denylist:
+ with open(args.key_denylist, 'r') as f:
+ key_denylist = [l.strip() for l in f.readlines()]
cluster = Cluster(files=args.files,
keyfunc=types.get(args.type),
tmpdir=args.tmpdir,
+ key_denylist=key_denylist,
prefix=args.prefix)
stats = cluster.run()
logger.debug(json.dumps(dict(stats)))
@@ -83,6 +88,7 @@ if __name__ == '__main__':
sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
sub_cluster.set_defaults(func=run_cluster)
sub_cluster.add_argument('-f', '--files', default="-", help='input files')
+ sub_cluster.add_argument('--key-denylist', help='file path to key denylist')
sub_cluster.add_argument('-t',
'--type',
default='title',
diff --git a/fuzzycat/sandcrawler-title-denylist.txt b/fuzzycat/sandcrawler-title-denylist.txt
new file mode 100644
index 0000000..ef575b4
--- /dev/null
+++ b/fuzzycat/sandcrawler-title-denylist.txt
@@ -0,0 +1,559 @@
+abbreviations
+abbreviationsandacronyms
+aboutauthors
+abouttheauthor
+abouttheauthors
+aboutthecover
+abouttheeditors
+abreviations
+abstract
+abstractnotsubmittedforonlinepublication
+abstractoriginalarticle
+abstracts
+abstractsofaapaposterandpodiumpresentations
+abstractsofcommunications
+abstractsofthesesfromthescandinaviancountries
+abstractwithdrawn
+acknowledgement
+acknowledgements
+acknowledgementsvii
+acknowledgementtoreferees
+acknowledgementtoreviewers
+acknowledgment
+acknowledgmentofreferees
+acknowledgments
+addendum
+additionalresources
+address
+advertisement
+advertisersindex
+affect
+affiliation
+afterword
+agenda
+agradecimentos
+agradecimientos
+aimsandscope
+analysis
+annexa
+announcement
+announcements
+annualacknowledgementofmanuscriptreviewers
+anotefromtheeditor
+appendices
+appendix
+appendix1
+appendixa
+appendixb
+appointmentsandstaffchanges
+approximation
+apresentacao
+article
+articlenumber
+articles
+articlesofsignificantinterestselectedfromthisissuebytheeditors
+associationnews
+ataglance
+atribute
+attention
+authorguidelines
+authorindex
+authorindexforvolume81
+authorreply
+authors
+authorsreply
+authorsresponse
+avantpropos
+award
+awardsappointmentsannouncements
+backcover
+background
+backmatter
+berichtigung
+besprechungen
+bibliografia
+bibliographie
+bibliography
+bigdata
+blankpage
+blood
+boardoftrustees
+booknotes
+booknotices
+bookofabstracts
+bookreview
+bookreviews
+bookreviewsandnotices
+bookreviewssection
+booksreceived
+buchbesprechung
+buchbesprechungen
+bulletin
+calendar
+calendarofevents
+calendarofmeetings
+callforarticles
+callforpapers
+casereport
+casereports
+casestudy
+chairmansopeningremarks
+changes
+chaos
+chapter1
+chapter10
+chapter1introduction
+chapter2
+chapter7
+chapteri
+chapterone
+chapteroneintroduction
+chaptertwo
+chapterx
+citation
+classes
+classified
+classifieds
+closingremarks
+collaborateurs
+comment
+commentaries
+commentary
+commentaryon
+commenton
+comments
+commentto
+committee
+communication
+communications
+communicationstotheeditor
+communiquedepresse
+community
+components
+comptesrendus
+computerscience
+concludingremarks
+conclusion
+conclusions
+conferencereport
+congratulations
+congresscalendar
+conservation
+content
+contents
+context
+continuingeducation
+continuingmedicaleducation
+contributors
+copyright
+copyrightform
+copyrightnotice
+correction
+corrections
+correspondence
+corrigenda
+corrigendum
+councilminutes
+cover
+coverimage
+currentresearch
+curriculumvitae
+danksagung
+dearreaders
+decisionmaking
+dedication
+dedicatoria
+definition
+description
+discussion
+diskussion
+distribution
+documents
+ear
+economics
+editorial
+editorialadvisoryboard
+editorialannouncement
+editorialboard
+editorialcomment
+editorialcomments
+editorialconsultants
+editoriale
+editorialeditorial
+editorialforeword
+editorialinformation
+editorialintroduction
+editorialintroductions
+editorialnote
+editorialnotes
+editorialpreface
+editorials
+editorialsoftwaresurveysection
+editorialstaff
+editorialstatement
+editorinchief
+editors
+editorschoice
+editorscomment
+editorscomments
+editorscorner
+editorscorrespondence
+editorsforeword
+editorsintroduction
+editorsletter
+editorsnote
+editorsnotes
+editorspage
+editorspicks
+editorspreface
+education
+einfuhrung
+einleitung
+electrophoresis
+employment
+endnotes
+entrevista
+entscheidungsverzeichnis
+epilogue
+equipment
+errata
+erratum
+essay
+essays
+executivesummary
+exercises
+expediente
+extendedabstracts
+feature
+features
+fichatecnica
+figure3
+finalexam
+finalreport
+focus
+foreward
+foreword
+forthcomingarticles
+forthcomingevents
+fortherecord
+forum
+frequentlyaskedquestions
+fromtheeditor
+fromtheeditorinchief
+fromtheeditors
+fromtheeditorsdesk
+fromthepresident
+frontmatter
+furtherreadings
+genealogy
+generaldiscussion
+generalinformation
+generalintroduction
+germany
+gettingstarted
+glosario
+glossary
+glossaryofterms
+guesteditorial
+guesteditorsforeword
+guesteditorsintroduction
+guideforauthors
+guidelinesforcontributors
+health
+heartfailure
+highlights
+highlightsfromthisissue
+highlightsofthisissue
+history
+home
+homework
+hypothesis
+iii
+imageofthemonth
+impactfactor
+importantnotice
+impressum
+inbrief
+index
+indexofauthors
+indexofauthorsandtitles
+indice
+indicegeneral
+informationforauthors
+informationtoauthors
+inhalt
+inhaltsverzeichnis
+inleiding
+inmemoriam
+inreply
+inresponse
+insidethisissue
+institutenews
+instructionsforauthors
+instructionstoauthors
+interview
+inthestudy
+inthisissue
+introducao
+introduccion
+introduction
+introductionandoverview
+introductiongenerale
+introductiontotheissue
+introductiontothespecialissue
+introductorycomments
+introductoryremarks
+introduzione
+inventions
+invitedcommentary
+issuesandevents
+jobdescription
+journalclub
+journalscan
+keywords
+kurzkommentiert
+languageteaching
+lecture
+letter
+letterfromtheeditor
+letterfromtheeditorinchief
+letterfromtheeditors
+letterfromthepresident
+letters
+letterstotheeditor
+letterstotheeditors
+lettertotheeditor
+lettertotheeditors
+liminaire
+linearalgebra
+linearregression
+links
+listedestableaux
+listofabbreviations
+listofcontributors
+listoffigures
+listofparticipants
+listofpublications
+listofreferees
+listofreviewers
+listoftables
+literacy
+literatur
+literature
+literaturecited
+literaturereview
+literaturrundschau
+literaturverzeichnis
+litteraturverzeichniss
+livresrecus
+lucina
+lungcancer
+magazin
+maintenance
+materials
+materialsafetydatasheet
+materialsandmethods
+medicinalchemistry
+meetingabstracts
+meetingreport
+meetings
+meetingsandconferences
+meetingsofinterest
+membershipapplication
+memoranda
+memorandum
+messagefromgeneralcochairs
+messagefromthechairs
+messagefromtheeditor
+messagefromtheeditorinchief
+messagefromthepresident
+messagefromtheprogramchairs
+messagefromtheprogramcochairs
+metaanalysis
+miscellanea
+miscellaneous
+miscellany
+missionstatement
+motivation
+mrsnews
+name
+newbooks
+newlyelectedmembersofthecollege
+newproducts
+news
+newsandnotes
+newsandreviews
+newsandviews
+newsbriefs
+newsinbrief
+newsnotes
+newsviews
+noii
+note
+notefromtheeditor
+notes
+notesandcomments
+notesandnews
+notesdelecture
+notesforcontributors
+notesoncontributors
+notice
+noticeboard
+notitle
+notitleavailable
+nr
+obituaries
+obituary
+online
+openaccess
+openingaddress
+openingremarks
+oralabstracts
+oralpresentations
+organizingcommittee
+originalarticle
+originalarticles
+other
+outline
+overview
+panorama
+papers
+paperstoappearinforthcomingissues
+partone
+personalandmiscellaneous
+perspective
+perspectives
+philosophy
+pictureofthemonth
+place
+pointofview
+positionsavailable
+poster
+posterpresentations
+postscript
+preface
+prefaceandacknowledgements
+prefacetothesecondedition
+preliminarymaterial
+presentacio
+presentacion
+presentation
+presidentialaddress
+presidentsmessage
+presidentsreport
+pressrelease
+print
+printing
+proceedings
+proceedingsofthenationalacademyofsciences
+profile
+programcommittee
+projectmanagement
+prologue
+publication
+publichealth
+publishersnote
+question
+questionsandanswers
+radiology
+readersforum
+recensiones
+recensions
+recentpublications
+redaktorensforord
+referate
+references
+referenciasbibliograficas
+regression
+rehabilitation
+rejoinder
+remerciements
+reply
+replybyauthors
+researchresearchers
+resenas
+resources
+response
+responsetothelettertotheeditor
+results
+resume
+resumen
+resumes
+resumo
+retraction
+review
+reviewarticle
+revieweracknowledgement
+revieweracknowledgement2013
+reviewers
+reviewessay
+reviews
+reviewsanddescriptionsoftablesandbooks
+reviewsofbooks
+rezension
+rezensionen
+safety
+section
+security
+selectedbibliography
+shortcommunication
+shorternotices
+shortnotices
+socialengineering
+sociology
+sommaire
+sommario
+specialreport
+specialsection
+specifications
+spistresci
+subjectindex
+subscriptions
+suggestedreadings
+sumario
+summaries
+summariesofkeyjournalarticles
+summary
+summaryofproceedings
+summer
+sun
+supplementarymaterial
+symposium
+symptom
+synthese
+tabledesmatieres
+tableofcontents
+tableofcontentsandprologue
+technicalreport
+theauthors
+theauthorsreply
+thebasics
+theeditorsdesk
+thefirstauthorreplies
+thelancet
+theoreticalbackground
+thetimes
+theworldbank
+theyearinreview
+thismonthin
+thismonthinthejournal
+timemanagement
+titeleiinhaltsverzeichnis
+title
+titlepage
+titlepagei
+tocorrespondents
+totheeditor
+unitedkingdom
+unitednations
+unitedstates
+upcomingevents
+vorwort
+website
+welcome
+whatshappening
+whatsnew
+workscited
+yourquestionsanswered
+zudiesemheft
+zusammenfassung