diff options
-rw-r--r-- | python/notes/version_4.md | 161 | ||||
-rw-r--r-- | python/refcat/tasks.py | 6 |
2 files changed, 164 insertions, 3 deletions
diff --git a/python/notes/version_4.md b/python/notes/version_4.md index bf3f480..83d397b 100644 --- a/python/notes/version_4.md +++ b/python/notes/version_4.md @@ -567,3 +567,164 @@ on page one of references, there is "Gaussian processes in machine learning" https://www.researchgate.net/profile/Olivier_Bousquet/publication/238718428_Advanced_Lectures_on_Machine_Learning_ML_Summer_Schools_2003_Canberra_Australia_February_2-14_2003_Tubingen_Germany_August_4-16_2003_Revised_Lectures/links/02e7e52c5870850311000000/Advanced-Lectures-on-Machine-Learning-ML-Summer-Schools-2003-Canberra-Australia-February-2-14-2003-Tuebingen-Germany-August-4-16-2003-Revised-Lectures.pdf#page=70 - the paper itself does not contain a reference -- in the whole document. + +## OL fuzzy different + +Reasons, why pairs were marked as *different*: + +``` +$ zstdcat -T UnmatchedOpenLibraryMatchTable/date-2021-05-06.tsv.zst | grep ^different | cut -f2 | LC_ALL=C sort -S50% | uniq -c | sort -nr +47324670 year +46016349 contribintersectionempty + 582618 pagecount + 460 titlefilename + 25 numdiff +``` + +The `year` may refer to different editions: + +* https://fatcat.wiki/release/kngofkvoo5cinj4wqerrey4tpi/references +* https://openlibrary.org/works/OL16286792W/One_hundred_and_seventeen_days?edition=onehundredsevent00firs + +> 117 Days: An Account of Confinement and Interrogation under the South African +> 90-Day Detention Law.2006 | vs This edition was published in 1965 by Penguin +> Books + +## Data mismatch + +* FE: https://fatcat.wiki/release/niivpohpabhajdsf35x7hr4efm/references, [8]: 2011 + +refs (2017 only) + +``` +{ + "container_name": "19 & 20: Notes for a New Social Protagonism", + "container": { + "container_type": "", + "ident": "", + "issnl": "", + "name": "", + "publisher": "", + "revision": "", + "state": "", + "wikidata_qid": "" + }, + "contribs": [ + { + "raw_name": "Colective Situaciones" + } + ], + "ext_ids": {}, + "ident": "niivpohpabhajdsf35x7hr4efm", + "release_year": "2017", + 
"work_id": "7eghl5lcivfmha6d4uavrrkpce", + "extra": { + "crossref": {}, + "datacite": {}, + "skate": { + "status": "ref", + "ref": { + "index": 7, + "key": "\nkey\n\t\t\t\t20171225032503_CIT0007" + }, + "rg": {}, + "resolved_container_name": "" + }, + "ol": {} + } +} +{ + "container_name": "A Dictionary of Marxist Thought (2nd ed.)", + "container": { + "container_type": "", + "ident": "", + "issnl": "", + "name": "", + "publisher": "", + "revision": "", + "state": "", + "wikidata_qid": "" + }, + "ext_ids": {}, + "ident": "niivpohpabhajdsf35x7hr4efm", + "release_year": "2017", + "title": "Price of production and the transformation problem", + "work_id": "7eghl5lcivfmha6d4uavrrkpce", + "extra": { + "crossref": {}, + "datacite": {}, + "skate": { + "status": "ref", + "ref": { + "index": 12, + "key": "\nkey\n\t\t\t\t20171225032503_CIT0012" + }, + "rg": {}, + "resolved_container_name": "" + }, + "ol": {} + } +} +{ + "container_name": "A Grammar of the Multitude: For an Analysis of Contemporary Forms of Life", + "container": { + "container_type": "", + "ident": "", + "issnl": "", + "name": "", + "publisher": "", + "revision": "", + "state": "", + "wikidata_qid": "" + }, + "ext_ids": {}, + "ident": "niivpohpabhajdsf35x7hr4efm", + "release_year": "2017", + "work_id": "7eghl5lcivfmha6d4uavrrkpce", + "extra": { + "crossref": {}, + "datacite": {}, + "skate": { + "status": "ref", + "ref": { + "index": 45, + "key": "\nkey\n\t\t\t\t20171225032503_CIT0044" + }, + "rg": {}, + "resolved_container_name": "" + }, + "ol": {} + } +} +{ + "container_name": "An Introduction to the Three Volumes of Karl Marx's Capital", + "container": { + "container_type": "", + "ident": "", + "issnl": "", + "name": "", + "publisher": "", + "revision": "", + "state": "", + "wikidata_qid": "" + }, + "ext_ids": {}, + "ident": "niivpohpabhajdsf35x7hr4efm", + "release_year": "2017", + "work_id": "7eghl5lcivfmha6d4uavrrkpce", + "extra": { + "crossref": {}, + "datacite": {}, + "skate": { + "status": "ref", + 
"ref": { + "index": 21, + "key": "\nkey\n\t\t\t\t20171225032503_CIT0020" + }, + "rg": {}, + "resolved_container_name": "" + }, + "ol": {} + } +} +``` diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index c8cd515..6421007 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -43,11 +43,11 @@ Overview -------- * raw input "tasks" as luigi.ExternalTask -* derivation +* derivations Note: We mostly use some shell pipelines with UNIX and custom tools (see: skate); we may get rid of this "python layer" altogether, if we converged on what to -build. The most common pattern is map-reduce, e.g. derive a key from docs, +build. The most common pattern is "map-reduce", e.g. derive a key from docs, combine the results from e.g. two such key extractions and apply some reduction, e.g. output schema generation. @@ -63,7 +63,7 @@ Various schema Some operations, e.g. "fuzzy verification" require both compared documents to be release entities. This means that we need to convert different formats into -the release format. +the release format at some point. Mappers ------- |