aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/notes/version_4.md161
-rw-r--r--python/refcat/tasks.py6
2 files changed, 164 insertions, 3 deletions
diff --git a/python/notes/version_4.md b/python/notes/version_4.md
index bf3f480..83d397b 100644
--- a/python/notes/version_4.md
+++ b/python/notes/version_4.md
@@ -567,3 +567,164 @@ on page one of references, there is "Gaussian processes in machine learning"
https://www.researchgate.net/profile/Olivier_Bousquet/publication/238718428_Advanced_Lectures_on_Machine_Learning_ML_Summer_Schools_2003_Canberra_Australia_February_2-14_2003_Tubingen_Germany_August_4-16_2003_Revised_Lectures/links/02e7e52c5870850311000000/Advanced-Lectures-on-Machine-Learning-ML-Summer-Schools-2003-Canberra-Australia-February-2-14-2003-Tuebingen-Germany-August-4-16-2003-Revised-Lectures.pdf#page=70
- the paper itself does not contain a reference -- in the whole document.
+
+## OL fuzzy different
+
+Reasons why pairs were marked as *different*:
+
+```
+$ zstdcat -T UnmatchedOpenLibraryMatchTable/date-2021-05-06.tsv.zst | grep ^different | cut -f2 | LC_ALL=C sort -S50% | uniq -c | sort -nr
+47324670 year
+46016349 contribintersectionempty
+ 582618 pagecount
+ 460 titlefilename
+ 25 numdiff
+```
+
+The `year` may refer to different editions:
+
+* https://fatcat.wiki/release/kngofkvoo5cinj4wqerrey4tpi/references
+* https://openlibrary.org/works/OL16286792W/One_hundred_and_seventeen_days?edition=onehundredsevent00firs
+
+> 117 Days: An Account of Confinement and Interrogation under the South African
+> 90-Day Detention Law.2006 | vs This edition was published in 1965 by Penguin
+> Books
+
+## Data mismatch
+
+* FE: https://fatcat.wiki/release/niivpohpabhajdsf35x7hr4efm/references, [8]: 2011
+
+refs (2017 only)
+
+```
+{
+ "container_name": "19 & 20: Notes for a New Social Protagonism",
+ "container": {
+ "container_type": "",
+ "ident": "",
+ "issnl": "",
+ "name": "",
+ "publisher": "",
+ "revision": "",
+ "state": "",
+ "wikidata_qid": ""
+ },
+ "contribs": [
+ {
+ "raw_name": "Colective Situaciones"
+ }
+ ],
+ "ext_ids": {},
+ "ident": "niivpohpabhajdsf35x7hr4efm",
+ "release_year": "2017",
+ "work_id": "7eghl5lcivfmha6d4uavrrkpce",
+ "extra": {
+ "crossref": {},
+ "datacite": {},
+ "skate": {
+ "status": "ref",
+ "ref": {
+ "index": 7,
+ "key": "\nkey\n\t\t\t\t20171225032503_CIT0007"
+ },
+ "rg": {},
+ "resolved_container_name": ""
+ },
+ "ol": {}
+ }
+}
+{
+ "container_name": "A Dictionary of Marxist Thought (2nd ed.)",
+ "container": {
+ "container_type": "",
+ "ident": "",
+ "issnl": "",
+ "name": "",
+ "publisher": "",
+ "revision": "",
+ "state": "",
+ "wikidata_qid": ""
+ },
+ "ext_ids": {},
+ "ident": "niivpohpabhajdsf35x7hr4efm",
+ "release_year": "2017",
+ "title": "Price of production and the transformation problem",
+ "work_id": "7eghl5lcivfmha6d4uavrrkpce",
+ "extra": {
+ "crossref": {},
+ "datacite": {},
+ "skate": {
+ "status": "ref",
+ "ref": {
+ "index": 12,
+ "key": "\nkey\n\t\t\t\t20171225032503_CIT0012"
+ },
+ "rg": {},
+ "resolved_container_name": ""
+ },
+ "ol": {}
+ }
+}
+{
+ "container_name": "A Grammar of the Multitude: For an Analysis of Contemporary Forms of Life",
+ "container": {
+ "container_type": "",
+ "ident": "",
+ "issnl": "",
+ "name": "",
+ "publisher": "",
+ "revision": "",
+ "state": "",
+ "wikidata_qid": ""
+ },
+ "ext_ids": {},
+ "ident": "niivpohpabhajdsf35x7hr4efm",
+ "release_year": "2017",
+ "work_id": "7eghl5lcivfmha6d4uavrrkpce",
+ "extra": {
+ "crossref": {},
+ "datacite": {},
+ "skate": {
+ "status": "ref",
+ "ref": {
+ "index": 45,
+ "key": "\nkey\n\t\t\t\t20171225032503_CIT0044"
+ },
+ "rg": {},
+ "resolved_container_name": ""
+ },
+ "ol": {}
+ }
+}
+{
+ "container_name": "An Introduction to the Three Volumes of Karl Marx's Capital",
+ "container": {
+ "container_type": "",
+ "ident": "",
+ "issnl": "",
+ "name": "",
+ "publisher": "",
+ "revision": "",
+ "state": "",
+ "wikidata_qid": ""
+ },
+ "ext_ids": {},
+ "ident": "niivpohpabhajdsf35x7hr4efm",
+ "release_year": "2017",
+ "work_id": "7eghl5lcivfmha6d4uavrrkpce",
+ "extra": {
+ "crossref": {},
+ "datacite": {},
+ "skate": {
+ "status": "ref",
+ "ref": {
+ "index": 21,
+ "key": "\nkey\n\t\t\t\t20171225032503_CIT0020"
+ },
+ "rg": {},
+ "resolved_container_name": ""
+ },
+ "ol": {}
+ }
+}
+```
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index c8cd515..6421007 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -43,11 +43,11 @@ Overview
--------
* raw input "tasks" as luigi.ExternalTask
-* derivation
+* derivations
Note: We mostly use some shell pipelines with UNIX and custom tools (see: skate); we
may get rid of this "python layer" altogether, once we have converged on what to
-build. The most common pattern is map-reduce, e.g. derive a key from docs,
+build. The most common pattern is "map-reduce", e.g. derive a key from docs,
combine the results from e.g. two such key extractions and apply some
reduction, e.g. output schema generation.
@@ -63,7 +63,7 @@ Various schema
Some operations, e.g. "fuzzy verification" require both compared documents to
be release entities. This means that we need to convert different formats into
-the release format.
+the release format at some point.
Mappers
-------