23 files changed, 330 insertions, 32 deletions
diff --git a/extra/stats/2019-01-07-prod-stats.json b/extra/stats/2019-01-07-prod-stats.json new file mode 100644 index 00000000..6a1d93a1 --- /dev/null +++ b/extra/stats/2019-01-07-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":3091330,"timestamp":"2020-01-08T00:06:10.718421+00:00"}},"container":{"total":147021},"papers":{"in_kbart":60418765,"in_web":18284474,"in_web_not_kbart":8356539,"is_oa":10473319,"total":100551479},"release":{"refs_total":804237857,"total":124697305}} diff --git a/extra/stats/2019-01-07-prod-table-sizes.txt b/extra/stats/2019-01-07-prod-table-sizes.txt new file mode 100644 index 00000000..4c205d38 --- /dev/null +++ b/extra/stats/2019-01-07-prod-table-sizes.txt @@ -0,0 +1,35 @@ + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."refs_blob" | 83 GB | 2875 MB | 86 GB + "public"."release_rev" | 47 GB | 29 GB | 76 GB + "public"."release_contrib" | 35 GB | 31 GB | 66 GB + "public"."release_edit" | 12 GB | 19 GB | 31 GB + "public"."work_edit" | 12 GB | 19 GB | 30 GB + "public"."release_ident" | 8257 MB | 14 GB | 22 GB + "public"."work_ident" | 8054 MB | 14 GB | 22 GB + "public"."abstracts" | 12 GB | 918 MB | 13 GB + "public"."file_rev_url" | 9398 MB | 3469 MB | 13 GB + "public"."work_rev" | 5226 MB | 5825 MB | 11 GB + "public"."release_ref" | 3950 MB | 5626 MB | 9577 MB + "public"."file_rev" | 3081 MB | 4450 MB | 7531 MB + "public"."file_edit" | 2544 MB | 3805 MB | 6348 MB + "public"."file_ident" | 1668 MB | 2360 MB | 4028 MB + "public"."file_rev_release" | 1538 MB | 2387 MB | 3926 MB + "public"."release_rev_abstract" | 1370 MB | 1697 MB | 3067 MB + "public"."creator_edit" | 702 MB | 942 MB | 1643 MB + "public"."creator_rev" | 695 MB | 719 MB | 1413 MB + "public"."creator_ident" | 474 MB | 648 MB | 1121 MB + "public"."editgroup" | 663 MB | 369 MB | 1032 MB + "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB + "public"."changelog" | 187 MB | 205 MB | 393 MB + "public"."container_rev" | 75 MB | 22 MB | 97 MB + "public"."container_edit" | 24 MB | 31 MB | 55 MB + "public"."container_ident" | 11 MB | 19 MB | 29 MB + "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."editor" | 16 kB | 48 kB | 64 kB + "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB + "public"."auth_oidc" | 16 kB | 48 kB | 64 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB +[...] diff --git a/guide/src/entity_release.md b/guide/src/entity_release.md index f6233c13..242a3c72 100644 --- a/guide/src/entity_release.md +++ b/guide/src/entity_release.md @@ -24,11 +24,11 @@ publicly available. Blank if only year is known. - `release_year` (integer): year when this release was first made publicly available; should match `release_date` if both are known. -- `withdrawn_status` (string, controlled set): -- `release_date` (string, ISO date format): when this release was first made - publicly available. Blank if only year is known. -- `release_year` (integer): year when this release was first made - publicly available; should match `release_date` if both are known. +- `withdrawn_status` (optional, string, controlled set): +- `withdrawn_date` (optional, string, ISO date format): when this release was withdrawn. + Blank if only year is known. 
+- `withdrawn_year` (optional, integer): year when this release was withdrawn; should + match `withdrawn_date` if both are known. - `ext_ids` (key/value object of string-to-string mappings): external identifiers. At least an empty `ext_ids` object is always required for release entities, so individual identifiers can be accessed directly. diff --git a/notes/bulk_edits/2019-12-20_orcid.md b/notes/bulk_edits/2019-12-20_orcid.md new file mode 100644 index 00000000..33dde32f --- /dev/null +++ b/notes/bulk_edits/2019-12-20_orcid.md @@ -0,0 +1,43 @@ + +Newer ORCID dumps are XML, not JSON. But there is a conversion tool! + + https://github.com/ORCID/orcid-conversion-lib + +Commands: + + wget https://github.com/ORCID/orcid-conversion-lib/raw/master/target/orcid-conversion-lib-0.0.2-full.jar + java -jar orcid-conversion-lib-0.0.2-full.jar OPTIONS + + java -jar orcid-conversion-lib-0.0.2-full.jar --tarball -i ORCID_2019_summaries.tar.gz -v v3_0rc1 -o ORCID_2019_summaries_json.tar.gz + + # [...] + # Sat Dec 21 04:43:50 UTC 2019 done 7300000 + # Sat Dec 21 04:44:08 UTC 2019 done 7310000 + # Sat Dec 21 04:44:17 UTC 2019 finished errors 0 + +Importing in QA, ran in to some lines like: + + {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0003-0014-6598","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"} + {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0003-3750-5654","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"} + {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0003-1424-4826","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"} + {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0002-5340-9665","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"} + +Needed to patch to filter those out. Then ran ok like: + + zcat /srv/fatcat/datasets/ORCID_2019_summaries.sample_10k.json.gz | ./fatcat_import.py orcid - + Counter({'total': 10000, 'exists': 5323, 'insert': 4493, 'skip': 184, 'skip-no-person': 160, 'update': 0}) + +New dump is about 7.3 million rows, so expecting about 3.2 million new +entities, 250k skips. + +Doing bulk run like: + + time zcat /srv/fatcat/datasets/ORCID_2019_summaries.json.gz | parallel -j8 --round-robin --pipe ./fatcat_import.py orcid - + +Prod timing: + + Counter({'total': 910643, 'exists': 476812, 'insert': 416583, 'skip': 17248, 'update': 0}) + + real 47m27.658s + user 245m44.272s + sys 14m50.836s diff --git a/notes/bulk_edits/2019-12-20_updates.md b/notes/bulk_edits/2019-12-20_updates.md index a8f62ea9..83c8d9da 100644 --- a/notes/bulk_edits/2019-12-20_updates.md +++ b/notes/bulk_edits/2019-12-20_updates.md @@ -80,3 +80,13 @@ x fix bad DOI error (real error, skip these) x remove newline after "unparsable medline date" error x remove extra line like "existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))" in warning +## Chocula + +Command: + + export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...] 
+ ./fatcat_import.py chocula /srv/fatcat/datasets/export_fatcat.2019-12-26.json + +Result: + + Counter({'total': 144455, 'exists': 139807, 'insert': 2384, 'skip': 2264, 'skip-unknown-new-issnl': 2264, 'exists-by-issnl': 306, 'update': 0}) diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md index 80760938..2db0c72d 100644 --- a/notes/bulk_edits/CHANGELOG.md +++ b/notes/bulk_edits/CHANGELOG.md @@ -9,8 +9,19 @@ this file should probably get merged into the guide at some point. This file should not turn in to a TODO list! +## 2020-01 + +Imported around 2,500 new containers (journals, by ISSN-L) from chocula +analysis script. + ## 2019-12 +Started continuous harvesting Datacite DOI metadata; first date harvested was +`2019-12-13`. No importer running yet. + +Imported about 3.3m new ORCID identifiers from 2019 bulk dump (after converting +from XML to JSON): <https://archive.org/details/orcid-dump-2019> + Inserted about 154k new arxiv release entities. Still no automatic daily harvesting. @@ -45,22 +56,9 @@ invalid ISSN checksum). Imported files (matched to releases by DOI) from Semantic Scholar (`DIRECT-OA-CRAWL-2019` crawl). - Arabesque importer - crawl-bot - `s2_doi.sqlite` - TODO: archive.org link - TODO: rough count - TODO: date - Imported files (matched to releases by DOI) from pre-1923/pre-1909 items uploaded by a user to archive.org. - Matched importer - internetarchive-bot (TODO:) - TODO: archive.org link - TODO: counts - TODO: date - Imported files (matched to releases by DOI) from CORE.ac.uk (`DIRECT-OA-CRAWL-2019` crawl). diff --git a/proposals/20190509_schema_tweaks.md b/proposals/20190509_v03_schema_tweaks.md index 7e372959..150ce525 100644 --- a/proposals/20190509_schema_tweaks.md +++ b/proposals/20190509_v03_schema_tweaks.md @@ -1,4 +1,6 @@ +Status: implemented + # SQL (and API) schema changes Intend to make these changes at the same time as bumping OpenAPI schema from @@ -139,4 +141,4 @@ Do these as separate commits, after merging back in to master, for v0.3: `release_month`: apprently pretty common to know the year and month but not date. I have avoided so far, seems like unnecessary complexity. Could start -as an `extra_json` field? +as an `extra_json` field? NOT IMPLEMENTED diff --git a/proposals/20190510_editgroup_endpoint_prefix.md b/proposals/20190510_editgroup_endpoint_prefix.md index f517383b..6794266e 100644 --- a/proposals/20190510_editgroup_endpoint_prefix.md +++ b/proposals/20190510_editgroup_endpoint_prefix.md @@ -1,4 +1,6 @@ +Status: implemented + # Editgroup API Endpoint Prefixes In summary, change the API URL design such that entity mutations (create, diff --git a/proposals/20190510_release_ext_ids.md b/proposals/20190510_release_ext_ids.md index 1d2b912a..8953448c 100644 --- a/proposals/20190510_release_ext_ids.md +++ b/proposals/20190510_release_ext_ids.md @@ -1,4 +1,6 @@ +Status: implemented + # Release External ID Refactor Goal is to make the external identifier "namespace" (number of external diff --git a/proposals/20190514_fatcat_identifiers.md b/proposals/20190514_fatcat_identifiers.md new file mode 100644 index 00000000..325e48f5 --- /dev/null +++ b/proposals/20190514_fatcat_identifiers.md @@ -0,0 +1,27 @@ + +Status: brainstorm + +Fatcat Identifiers +======================= + +AKA, `fcid` + +## Public Use / Reference + +When referencing identifiers in external databases, should prefix with the +entity type. 
Eg: + + release_hsmo6p4smrganpb3fndaj2lon4 + editgroup_qinmjr2lbvgd3mbt7mifir23fy + +Or with a prefix: + + fatcat:release_hsmo6p4smrganpb3fndaj2lon4 + +As a usability affordance, the public web interface (though not API) should do +permanent redirects HTTP (301 or 308) to the canonical page like: + + https://fatcat.wiki/release_hsmo6p4smrganpb3fndaj2lon4 + HTTP 301 => https://fatcat.wiki/release/hsmo6p4smrganpb3fndaj2lon4 + +However, no intention to use identifiers in this schema in the API itself? diff --git a/proposals/20190911_search_query_parsing.md b/proposals/20190911_search_query_parsing.md new file mode 100644 index 00000000..f1fb0128 --- /dev/null +++ b/proposals/20190911_search_query_parsing.md @@ -0,0 +1,28 @@ + +Status: brainstorm + +## Search Query Parsing + +The default "release" search on fatcat.wiki currently uses the elasticsearch +built-in `query_string` parser, which is explicitly not recommended for +public/production use. + +The best way forward is likely a custom query parser (eg, PEG-generated parser) +that generates a complete elasticsearch query JSON structure. + +A couple search issues this would help with: + +- better parsing of keywords (year, year-range, DOI, ISSN, etc) in complex + queries and turning these in to keyword term sub-queries +- queries including terms from multiple fields which aren't explicitly tagged + (eg, "lovelace computer" vs. "author:lovelace title:computer") +- avoiding unsustainably expensive queries (eg, prefix wildcard, regex) +- handling single-character mispellings and synonyms +- collapsing multiple releases under the same work in search results + +In the near future, we may also create a fulltext search index, which will have +it's own issues. + +## Tech Changes + +If we haven't already, should also switch to using elasticsearch client library. diff --git a/proposals/20190911_v04_schema_tweaks.md b/proposals/20190911_v04_schema_tweaks.md index 8ccbac79..eaf39474 100644 --- a/proposals/20190911_v04_schema_tweaks.md +++ b/proposals/20190911_v04_schema_tweaks.md @@ -1,5 +1,7 @@ -status: work-in-progress +Status: planned + +## Schema Changes for v0.4 Release Proposed schema changes for next fatcat iteration (v0.4? v0.5?). @@ -17,6 +19,9 @@ SQL (and API, and elasticsearch): - TODO: release: switch how pages work? first/last? - TODO: indication of peer-review process? at release or container level? - TODO: container: separate canonical and disambiguating titles (?) +- TODO: release inter-references using SCHOLIX/Datacite schema + https://zenodo.org/record/1120265 + https://support.datacite.org/docs/connecting-research-outputs#section-related-identifiers API tweaks: diff --git a/proposals/20191018_bigger_db.md b/proposals/20191018_bigger_db.md index cd5f6e7b..7a5216d0 100644 --- a/proposals/20191018_bigger_db.md +++ b/proposals/20191018_bigger_db.md @@ -1,4 +1,8 @@ +Status: brainstorm + +## Catalog Database Scaling + How can we scale the fatcat backend to support: - one billion release entities diff --git a/proposals/20200103_py37_refactors.md b/proposals/20200103_py37_refactors.md new file mode 100644 index 00000000..f0321b33 --- /dev/null +++ b/proposals/20200103_py37_refactors.md @@ -0,0 +1,101 @@ + +status: planning + +If we update fatcat python code to python3.7, what code refactoring changes can +we make? We currently use/require python3.5. + +Nice features in python3 I know of are: + +- dataclasses (python3.7) +- async/await (mature in python3.7?) 
+- type annotations (python3.5) +- format strings (python3.6) +- walrus assignment (python3.8) + +Not sure if the walrus operator is worth jumping all the way to python3.8. + +While we might be at it, what other superficial factorings might we want to do? + +- strict lint style (eg, maximum column width) with `black` (python3.6) +- logging/debugging/verbose +- type annotations and checking +- use named dicts or structs in place of dicts + +## Linux Distro Support + +The default python version shipped by current and planned linux releases are: + +- ubuntu xenial 16.04 LTS: python3.5 +- ubuntu bionic 18.04 LTS: python3.6 +- ubuntu focal 20.04 LTS: python3.8 (planned) +- debian buster 10 2019: python3.7 + +Python 3.7 is the default in debian buster (10). + +There are apt PPA package repositories that allow backporting newer pythons to +older releases. As far as I know this is safe and doesn't override any system +usage if we are careful not to set the defaults (aka, `python3` command should +be the older version unless inside a virtualenv). + +It would also be possible to use `pyenv` to have `virtualenv`s with custom +python versions. We should probably do that for OS X and/or windows support if +we wanted those. But having a system package is probably a lot faster to +install. + +## Dataclasses + +`dataclasses` are a user-friendly way to create struct-like objects. They are +pretty similar to the existing `namedtuple`, but can be mutable and have +methods attached to them (they are just classes), plus several other usability +improvements. + +Most places we are throwing around dicts with structure we could be using +dataclasses instead. There are some instances of this in fatcat, but many more +in sandcrawler. + +## Async/Await + +Where might we actually use async/await? I think more in sandcrawler than in +the python tools or web apps. The GROBID, ingest, and ML workers in particular +should be async over batches, as should all fetches from CDX/wayback. + +Some of the kafka workers *could* be aync, but i'm not sure how much speedup +there would actually be. For example, the entity updates worker could fetch +entities for an editgroup concurrently. + +Inserts (importers) should probably mostly happen serially, at least the kafka +importers, one editgroup at a time, so progress is correctly recorded in kafka. +Parallelization should probably happen at the partition level; would need to +think through whether async would actually help with code simplicity vs. thread +or process parallelization. + +## Type Annotations + +The meta-goals of (gradual) type annotations would be catching more bugs at +development time, and having code be more self-documenting and easier to +understand. + +The two big wins I see with type annotation would be having annotations +auto-generated for the openapi classes and API calls, and to make string +munging in importer code less buggy. + +## Format Strings + +Eg, replace code like: + + "There are {} out of {} objects".format(found, total) + +With: + + f"There are {found} out of {total} objects" + +## Walrus Operator + +New operator allows checking and assignment together: + + if (n := len(a)) > 10: + print(f"List is too long ({n} elements, expected <= 10)") + +I feel like we would actually use this pattern *a ton* in importer code, where +we do a lot of lookups or cleaning then check if we got a `None`. 
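To make the dataclass, type-annotation, and walrus points above concrete, here is a minimal sketch of how importer-style cleaning code might read; `CleanedRecord` and `lookup_doi` are illustrative names (not existing fatcat code), and the walrus line assumes python3.8:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class CleanedRecord:
        # struct-like container instead of an untyped dict
        doi: Optional[str]
        title: str
        year: Optional[int] = None

    def lookup_doi(raw: dict) -> Optional[CleanedRecord]:
        # the lookup-then-check-for-None pattern described above
        if (doi := raw.get("doi")) is None:  # walrus assignment; python3.8 only
            return None
        return CleanedRecord(doi=doi.lower(), title=raw.get("title", ""))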
+ diff --git a/proposals/README.md b/proposals/README.md new file mode 100644 index 00000000..5e6747b1 --- /dev/null +++ b/proposals/README.md @@ -0,0 +1,11 @@ + +This folder contains proposals for larger changes to the fatcat system. These +might be schema changes, new projects, technical details, etc. Any change which +is large enough to require planning and documentation. + +Each should be tagged with a date first drafted, and labeled with a status: + +- brainstorm: just putting ideas down; might not even happen +- planned: commited to happening, but not started yet +- work-in-progress: currently being worked on +- implemented: completed, merged to master/production/live diff --git a/python/Pipfile b/python/Pipfile index 01c1eb3d..1a19a145 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -8,7 +8,7 @@ verify_ssl = true name = "pypi" [dev-packages] -pytest = ">=4,<5.0.0" +pytest = ">=5,<6.0.0" pytest-pythonpath = "*" pytest-pylint = "*" ipython = "<7.0.0" diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 35125b67..a4408cdd 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,11 @@ { "_meta": { "hash": { +<<<<<<< HEAD + "sha256": "03fc6c65c7bcbf96a5ef90afba8b6a0264a248a67b31ed339f399470b5f3d5fc" +======= "sha256": "fb9c3d2307483efe01d9c28a306bad319c84a94a4253d5c7c25bcfe2dad20c5d" +>>>>>>> martin-datacite-import }, "pipfile-spec": 6, "requires": { @@ -298,6 +302,8 @@ ], "version": "==2.5.0" }, +<<<<<<< HEAD +======= "langdetect": { "hashes": [ "sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30" @@ -305,6 +311,7 @@ "index": "pypi", "version": "==1.0.7" }, +>>>>>>> martin-datacite-import "loginpass": { "hashes": [ "sha256:717c87c1870a7e00547fd9d989aea9b22232b2f48826f552d79c34a47f9618c9", @@ -617,7 +624,12 @@ }, "wcwidth": { "hashes": [ +<<<<<<< HEAD + "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603", + "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8" +======= "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603" +>>>>>>> martin-datacite-import ], "version": "==0.1.8" }, @@ -645,13 +657,6 @@ ], "version": "==2.3.3" }, - "atomicwrites": { - "hashes": [ - "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", - "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" - ], - "version": "==1.3.0" - }, "attrs": { "hashes": [ "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", @@ -805,7 +810,6 @@ "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d", "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564" ], - "markers": "python_version > '2.7'", "version": "==8.0.2" }, "packaging": { @@ -923,11 +927,19 @@ }, "pytest": { "hashes": [ +<<<<<<< HEAD + "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa", + "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4" + ], + "index": "pypi", + "version": "==5.3.2" +======= "sha256:6192875be8af57b694b7c4904e909680102befcb99e610ef3d9f786952f795aa", "sha256:f8447ebf8fd3d362868a5d3f43a9df786dfdfe9608843bd9002a2d47a104808f" ], "index": "pypi", "version": "==4.6.8" +>>>>>>> martin-datacite-import }, "pytest-cov": { "hashes": [ @@ -1032,7 +1044,12 @@ }, "wcwidth": { "hashes": [ +<<<<<<< HEAD + "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603", + "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8" +======= 
"sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603" +>>>>>>> martin-datacite-import ], "version": "==0.1.8" }, diff --git a/python/fatcat_import.py b/python/fatcat_import.py index ea7e12f2..fb8830ca 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -61,7 +61,8 @@ def run_journal_metadata(args): def run_chocula(args): fii = ChoculaImporter(args.api, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + do_updates=args.do_updates) JsonLinePusher(fii, args.json_file).run() def run_matched(args): @@ -315,6 +316,9 @@ def main(): sub_chocula.add_argument('json_file', help="chocula JSON entities file (or stdin)", default=sys.stdin, type=argparse.FileType('r')) + sub_chocula.add_argument('--do-updates', + action='store_true', + help="update pre-existing container entities") sub_matched = subparsers.add_parser('matched', help="add file entities matched against existing releases; custom JSON format") diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index acfc2b87..c71b33e9 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -47,6 +47,7 @@ class ArabesqueMatchImporter(EntityImporter): eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter') if kwargs.get('crawl_id'): eg_extra['crawl_id'] = kwargs.get('crawl_id') + kwargs['do_updates'] = kwargs.get("do_updates", False) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, @@ -56,7 +57,6 @@ class ArabesqueMatchImporter(EntityImporter): self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel self.default_mimetype = kwargs.get("default_mimetype", None) - self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: print("Requiring GROBID status == 200") diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index eea50314..375b6051 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -109,6 +109,9 @@ class ChoculaImporter(EntityImporter): # decide whether to update do_update = False + if not self.do_updates: + self.counts['exists'] += 1 + return False if not existing.extra: existing.extra = dict() if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index be5db8d8..8d103372 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -287,6 +287,7 @@ class EntityImporter: eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') self.api = api + self.do_updates = bool(kwargs.get('do_updates', True)) self.bezerk_mode = kwargs.get('bezerk_mode', False) self.submit_mode = kwargs.get('submit_mode', False) self.edit_batch_size = kwargs.get('edit_batch_size', 100) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 33c40eff..16643eb5 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -14,13 +14,13 @@ class IngestFileResultImporter(EntityImporter): eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool" eg_extra = kwargs.pop('editgroup_extra', dict()) eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') + kwargs['do_updates'] 
= kwargs.get("do_updates", False) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel - self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: print("Requiring GROBID status == 200") diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3611a299..c32ce34a 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -715,6 +715,10 @@ class PubmedImporter(EntityImporter): re.ext_ids.doi = None re.work_id = existing.work_id + if existing and not self.do_updates: + self.counts['exists'] += 1 + return False + if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): # TODO: any other reasons to do an update? # don't update if it already has PMID |
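The importer changes above all follow one `do_updates` pattern: the shared `EntityImporter` base class now owns the flag (defaulting to allowing updates), importers that should not touch existing entities flip the default before calling `super().__init__()`, and update paths bail out early when the flag is off. A simplified sketch of that pattern (class and method shapes trimmed down from the real fatcat importers):

    from collections import Counter

    class EntityImporter:
        def __init__(self, api, **kwargs):
            self.api = api
            self.counts = Counter()
            # base class owns the flag; default is to allow updates
            self.do_updates = bool(kwargs.get('do_updates', True))

    class IngestFileResultImporter(EntityImporter):
        def __init__(self, api, **kwargs):
            # crawl-result importers default to *not* updating existing entities
            kwargs['do_updates'] = kwargs.get('do_updates', False)
            super().__init__(api, **kwargs)

        def try_update(self, re, existing):
            # skip early when updates are disabled and the entity already exists
            if existing and not self.do_updates:
                self.counts['exists'] += 1
                return False
            return True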