22 files changed, 768 insertions, 96 deletions
diff --git a/extra/stats/2020-02-19-prod-stats.json b/extra/stats/2020-02-19-prod-stats.json new file mode 100644 index 00000000..a2313233 --- /dev/null +++ b/extra/stats/2020-02-19-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":3509511,"timestamp":"2020-02-20T01:42:50.980212+00:00"}},"container":{"total":148356},"papers":{"in_kbart":60523853,"in_web":19616767,"in_web_not_kbart":8937938,"is_oa":11524180,"total":105665352},"release":{"refs_total":889522285,"total":143709455}} diff --git a/extra/stats/2020-02-19-prod-table-sizes.txt b/extra/stats/2020-02-19-prod-table-sizes.txt new file mode 100644 index 00000000..cab2b52e --- /dev/null +++ b/extra/stats/2020-02-19-prod-table-sizes.txt @@ -0,0 +1,46 @@ +Size: 476.74G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 53 GB | 43 GB | 96 GB + "public"."release_rev" | 58 GB | 33 GB | 91 GB + "public"."refs_blob" | 85 GB | 2884 MB | 88 GB + "public"."release_edit" | 14 GB | 20 GB | 34 GB + "public"."work_edit" | 13 GB | 20 GB | 34 GB + "public"."release_ident" | 9504 MB | 15 GB | 24 GB + "public"."work_ident" | 9302 MB | 15 GB | 24 GB + "public"."abstracts" | 16 GB | 1501 MB | 18 GB + "public"."file_rev_url" | 9980 MB | 3550 MB | 13 GB + "public"."work_rev" | 6038 MB | 5825 MB | 12 GB + "public"."release_ref" | 3997 MB | 5690 MB | 9686 MB + "public"."file_rev" | 3472 MB | 5103 MB | 8574 MB + "public"."file_edit" | 2934 MB | 3959 MB | 6893 MB + "public"."release_rev_abstract" | 2402 MB | 3339 MB | 5742 MB + "public"."file_ident" | 1795 MB | 2437 MB | 4231 MB + "public"."file_rev_release" | 1651 MB | 2428 MB | 4078 MB + "public"."creator_edit" | 702 MB | 942 MB | 1643 MB + "public"."creator_rev" | 695 MB | 719 MB | 1413 MB + "public"."editgroup" | 761 MB | 404 MB | 1164 MB + "public"."creator_ident" | 474 MB | 648 MB | 1121 MB + "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB + "public"."changelog" | 218 MB | 214 MB | 432 MB + "public"."container_rev" | 75 MB | 23 MB | 98 MB + "public"."container_edit" | 25 MB | 31 MB | 56 MB + "public"."container_ident" | 11 MB | 19 MB | 30 MB + "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."auth_oidc" | 16 kB | 48 kB | 64 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."editor" | 16 kB | 48 kB | 64 kB + "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB + "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/proposals/2020_sql_size_reduction.md b/proposals/2020_sql_size_reduction.md index f421e455..2fa39873 100644 --- a/proposals/2020_sql_size_reduction.md +++ b/proposals/2020_sql_size_reduction.md @@ -52,6 +52,8 @@ Other growth is expected to be much smaller, let's say a few GB of disk. This works out to a bit over 600 GByte total disk size. +NOTE: math was wrong? 
470 + 80 + 100 -> 650 GByte, call it 700 GByte + ## Idea: finish `ext_id` migration and drop columns+index from `release_rev` @@ -172,3 +174,17 @@ would drop ~20% of data size and ~20% of index size. Would it make more sense to use {ident, editgroup} as the primary key and UNIQ, then have a separate index on `editgroup`? On the assumption that `editgroup` cardinality is much smaller, thus the index disk usage would be smaller. + +## Idea: use binary for hashes + +We currently store file hashes (SHA-1, SHA-256, MD5) and abstracts/`ref_blobs` +keys as TEXT in lower-case hex encoding. Using binary instead could be as much +as a 50% size savings for both column and index storage. The difference becomes +more apparent when all files have all hashes populated. + +base32 encoded strings would be smaller (but non-negligable) savings. + +This change has a reasonable migration path, is entirely internal to postgres +and fatcatd, and would be no change to API schema. Postgres also allows `hex` +encoding on `bytea` data type, which can make reading/debugging reasonable. + diff --git a/python/fatcat_import.py b/python/fatcat_import.py index ad4de0e2..843685aa 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -166,6 +166,11 @@ def run_grobid_metadata(args): bezerk_mode=args.bezerk_mode) LinePusher(fmi, args.tsv_file).run() +def run_shadow_lib(args): + fmi = ShadowLibraryImporter(args.api, + edit_batch_size=100) + JsonLinePusher(fmi, args.json_file).run() + def run_wayback_static(args): api = args.api @@ -473,6 +478,16 @@ def main(): action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") + sub_shadow_lib = subparsers.add_parser('shadow-lib', + help="create release and file entities based on GROBID PDF metadata extraction") + sub_shadow_lib.set_defaults( + func=run_shadow_lib, + auth_var="FATCAT_AUTH_WORKER_SHADOW", + ) + sub_shadow_lib.add_argument('json_file', + help="JSON file to import from (or stdin)", + default=sys.stdin, type=argparse.FileType('r')) + sub_wayback_static = subparsers.add_parser('wayback-static', help="crude crawl+ingest tool for single-page HTML docs from wayback") sub_wayback_static.set_defaults( diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py index 6ce36974..6fda74c5 100755 --- a/python/fatcat_ingest.py +++ b/python/fatcat_ingest.py @@ -11,7 +11,7 @@ import argparse from collections import Counter import raven import elasticsearch -from elasticsearch_dsl import Search +from elasticsearch_dsl import Search, Q from fatcat_tools import public_api, simple_kafka_producer, kafka_fail_fast from fatcat_tools.transforms import release_ingest_request @@ -21,45 +21,54 @@ from fatcat_tools.transforms import release_ingest_request sentry_client = raven.Client() -def run_ingest_container(args): - """ - This command queries elasticsearch for releases from a given container (eg, - journal), and prepares ingest requests for them. - - By default it filters to releases which don't have any fulltext files - archived in IA, and dumps the ingest requests as JSON. 
- """ +def _init_search(args): # ensure API connection works args.api.get_changelog() + client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint) + search = Search(using=client, index="fatcat_release") + return search + + +def _run_search_dump(args, search): + + if args.dry_run: + print("=== THIS IS A DRY RUN ===") + kafka_producer = None ingest_file_request_topic = "sandcrawler-{}.ingest-file-requests".format(args.env) if args.enqueue_kafka: print("Will send ingest requests to kafka topic: {}".format(ingest_file_request_topic), file=sys.stderr) kafka_producer = simple_kafka_producer(args.kafka_hosts) - client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint) - - s = Search(using=client, index="fatcat_release") \ - .filter("term", in_ia=False) \ - .filter("term", is_oa=True) - - # filter/query by container - if args.container_id: - s = s.filter("term", container_id=args.container_id) - elif args.issnl: - s = s.filter("term", container_issnl=args.issnl) - elif args.publisher: - s = s.query("match", publisher=args.publisher) - elif args.name: - s = s.query("match", container_name=args.name) + if args.limit is not None: + search = search[:args.limit] + + if args.before_year: + search = search \ + .filter("exists", field="release_year") \ + .filter("range", release_date=dict(lt=args.before_year)) + if args.after_year: + search = search \ + .filter("exists", field="release_year") \ + .filter("range", release_date=dict(gte=args.after_year)) + + if not args.allow_non_oa: + search = search.filter("term", is_oa=True) + + if args.release_types: + release_types = args.release_types.split(',') + search = search \ + .filter("terms", release_type=release_types) else: - print("You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr) - sys.exit(-1) + search = search \ + .filter("bool", must_not=[ + Q("terms", release_type=["stub", "component"]) + ]) counts = Counter({'ingest_request': 0, 'elasticsearch_release': 0, 'estimate': 0}) - counts['estimate'] = s.count() + counts['estimate'] = search.count() print("Expecting {} release objects in search queries".format(counts['estimate']), file=sys.stderr) # don't try to clean up scroll if we are connected to public server (behind @@ -67,19 +76,24 @@ def run_ingest_container(args): if args.elasticsearch_endpoint in ( 'https://search.fatcat.wiki', 'https://search.qa.fatcat.wiki'): - s = s.params(clear_scroll=False) + search = search.params(clear_scroll=False) - results = s.scan() + results = search.scan() for esr in results: + if args.limit and counts['ingest_request'] >= args.limit: + break counts['elasticsearch_release'] += 1 release = args.api.get_release(esr.ident) ingest_request = release_ingest_request( release, - ingest_request_source="fatcat-ingest-container", + ingest_request_source="fatcat-ingest", ) if not ingest_request: continue - if kafka_producer != None: + counts['ingest_request'] += 1 + if args.dry_run: + continue + if kafka_producer is not None: kafka_producer.produce( ingest_file_request_topic, json.dumps(ingest_request).encode('utf-8'), @@ -87,12 +101,73 @@ def run_ingest_container(args): on_delivery=kafka_fail_fast, ) counts['kafka'] += 1 - # also printing to stdout when in kafka mode; could skip? 
- print(json.dumps(ingest_request)) - counts['ingest_request'] += 1 - if kafka_producer != None: + else: + print(json.dumps(ingest_request)) + if kafka_producer is not None: kafka_producer.flush() print(counts, file=sys.stderr) + if args.dry_run: + print("=== THIS WAS A DRY RUN ===") + + +def run_ingest_container(args): + """ + This command queries elasticsearch for releases from a given container (eg, + journal), and prepares ingest requests for them. + + By default it filters to releases which don't have any fulltext files + archived in IA, and dumps the ingest requests as JSON. + """ + + search = _init_search(args).filter("term", in_ia=False) + + # filter/query by container + if args.container_id: + search = search.filter("term", container_id=args.container_id) + elif args.issnl: + search = search.filter("term", container_issnl=args.issnl) + elif args.publisher: + search = search.query("match", publisher=args.publisher) + elif args.name: + search = search.query("match", container_name=args.name) + else: + print("You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr) + sys.exit(-1) + + return _run_search_dump(args, search) + + +def run_ingest_query(args): + """ + Accepts a free-form Lucene query language string. Intended to work the same + way as searches in the fatcat web interface. + """ + + search = _init_search(args) \ + .filter("term", in_ia=False) \ + .query( + "query_string", + query=args.query, + default_operator="AND", + analyze_wildcard=True, + lenient=True, + fields=["title^5", "contrib_names^2", "container_title"], + ) + + return _run_search_dump(args, search) + + +def run_ingest_extid(args): + """ + Selects release entities where the external identifier (extid) exists + """ + + search = _init_search(args) \ + .filter("term", in_ia=False) \ + .filter("exists", field=args.extid) + + return _run_search_dump(args, search) + def main(): parser = argparse.ArgumentParser( @@ -112,20 +187,51 @@ def main(): parser.add_argument('--env', default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)") + parser.add_argument('--limit', + default=None, + type=int, + help="Max number of search hits to return") + parser.add_argument('--dry-run', + action='store_true', + help="runs through creating all ingest requests, but doesn't actually output or enqueue") + parser.add_argument('--before-year', + type=str, + help="filters results to only with release_year before this (not inclusive)") + parser.add_argument('--after-year', + type=str, + help="filters results to only with release_year after this (inclusive)") + parser.add_argument('--release-types', + type=str, + help="filters results to specified release-types, separated by commas. By default, 'stub' is filtered out.") + parser.add_argument('--allow-non-oa', + action='store_true', + help="By default, we limit to OA releases. 
This removes that filter") subparsers = parser.add_subparsers() - sub_ingest_container = subparsers.add_parser('ingest-container', + sub_container = subparsers.add_parser('container', help="Create ingest requests for releases from a specific container") - sub_ingest_container.set_defaults(func=run_ingest_container) - sub_ingest_container.add_argument('--container-id', + sub_container.set_defaults(func=run_ingest_container) + sub_container.add_argument('--container-id', help="fatcat container entity ident") - sub_ingest_container.add_argument('--issnl', + sub_container.add_argument('--issnl', help="ISSN-L of container entity") - sub_ingest_container.add_argument('--publisher', + sub_container.add_argument('--publisher', help="publisher name") - sub_ingest_container.add_argument('--name', + sub_container.add_argument('--name', help="container name") + sub_query = subparsers.add_parser('query', + help="Create ingest requests for releases from a specific query") + sub_query.set_defaults(func=run_ingest_query) + sub_query.add_argument('query', + help="search query (same DSL as web interface search)") + + sub_extid = subparsers.add_parser('extid', + help="Create ingest requests for releases that have given extid defined") + sub_extid.set_defaults(func=run_ingest_extid) + sub_extid.add_argument('extid', + help="extid short name (as included in ES release schema)") + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 33f44600..d2d71d3c 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -70,8 +70,8 @@ class HarvestCrossrefWorker: def fail_fast(err, msg): if err is not None: - print("Kafka producer delivery error: {}".format(err)) - print("Bailing out...") + print("Kafka producer delivery error: {}".format(err), file=sys.stderr) + print("Bailing out...", file=sys.stderr) # TODO: should it be sys.exit(-1)? raise KafkaException(err) @@ -117,7 +117,7 @@ class HarvestCrossrefWorker: if http_resp.status_code == 503: # crude backoff; now redundant with session exponential # backoff, but allows for longer backoff/downtime on remote end - print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code)) + print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code), file=sys.stderr) # keep kafka producer connection alive self.producer.poll(0) time.sleep(30.0) @@ -131,7 +131,7 @@ class HarvestCrossrefWorker: items = self.extract_items(resp) count += len(items) print("... 
got {} ({} of {}), HTTP fetch took {}".format(len(items), count, - self.extract_total(resp), http_resp.elapsed)) + self.extract_total(resp), http_resp.elapsed), file=sys.stderr) #print(json.dumps(resp)) for work in items: self.producer.produce( @@ -156,7 +156,7 @@ class HarvestCrossrefWorker: while True: current = self.state.next(continuous) if current: - print("Fetching DOIs updated on {} (UTC)".format(current)) + print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr) self.fetch_date(current) self.state.complete(current, kafka_topic=self.state_topic, @@ -164,11 +164,11 @@ class HarvestCrossrefWorker: continue if continuous: - print("Sleeping {} seconds...".format(self.loop_sleep)) + print("Sleeping {} seconds...".format(self.loop_sleep), file=sys.stderr) time.sleep(self.loop_sleep) else: break - print("{} DOI ingest caught up".format(self.name)) + print("{} DOI ingest caught up".format(self.name), file=sys.stderr) class HarvestDataciteWorker(HarvestCrossrefWorker): diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index 78830a1c..310366bd 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -57,6 +57,10 @@ class HarvestState: if catchup_days or start_date or end_date: self.enqueue_period(start_date, end_date, catchup_days) + def __str__(self): + return '<HarvestState to_process={}, completed={}>'.format( + len(self.to_process), len(self.completed)) + def enqueue_period(self, start_date=None, end_date=None, catchup_days=14): """ This function adds a time period to the "TODO" list, unless the dates @@ -129,7 +133,7 @@ class HarvestState: def fail_fast(err, msg): if err: raise KafkaException(err) - print("Commiting status to Kafka: {}".format(kafka_topic)) + print("Commiting status to Kafka: {}".format(kafka_topic), file=sys.stderr) producer_conf = kafka_config.copy() producer_conf.update({ 'delivery.report.only.error': True, @@ -154,7 +158,7 @@ class HarvestState: if not kafka_topic: return - print("Fetching state from kafka topic: {}".format(kafka_topic)) + print("Fetching state from kafka topic: {}".format(kafka_topic), file=sys.stderr) def fail_fast(err, msg): if err: raise KafkaException(err) @@ -191,4 +195,4 @@ class HarvestState: # verify that we got at least to HWM assert c >= hwm[1] - print("... got {} state update messages, done".format(c)) + print("... got {} state update messages, done".format(c), file=sys.stderr) diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index f908ba83..11b5fa0a 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -49,13 +49,14 @@ class HarvestOaiPmhWorker: self.name = "unnamed" self.state = HarvestState(start_date, end_date) self.state.initialize_from_kafka(self.state_topic, self.kafka_config) + print(self.state, file=sys.stderr) def fetch_date(self, date): def fail_fast(err, msg): if err is not None: - print("Kafka producer delivery error: {}".format(err)) - print("Bailing out...") + print("Kafka producer delivery error: {}".format(err), file=sys.stderr) + print("Bailing out...", file=sys.stderr) # TODO: should it be sys.exit(-1)? 
raise KafkaException(err) @@ -79,14 +80,14 @@ class HarvestOaiPmhWorker: 'until': date_str, }) except sickle.oaiexceptions.NoRecordsMatch: - print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str)) + print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str), file=sys.stderr) return count = 0 for item in records: count += 1 if count % 50 == 0: - print("... up to {}".format(count)) + print("... up to {}".format(count), file=sys.stderr) producer.produce( self.produce_topic, item.raw.encode('utf-8'), @@ -99,7 +100,7 @@ class HarvestOaiPmhWorker: while True: current = self.state.next(continuous) if current: - print("Fetching DOIs updated on {} (UTC)".format(current)) + print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr) self.fetch_date(current) self.state.complete(current, kafka_topic=self.state_topic, @@ -107,11 +108,11 @@ class HarvestOaiPmhWorker: continue if continuous: - print("Sleeping {} seconds...".format(self.loop_sleep)) + print("Sleeping {} seconds...".format(self.loop_sleep), file=sys.stderr) time.sleep(self.loop_sleep) else: break - print("{} OAI-PMH ingest caught up".format(self.name)) + print("{} OAI-PMH ingest caught up".format(self.name), file=sys.stderr) class HarvestArxivWorker(HarvestOaiPmhWorker): diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index d936605f..10557ef8 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE from .wayback_static import auto_wayback_static from .cdl_dash_dat import auto_cdl_dash_dat from .ingest import IngestFileResultImporter, SavePaperNowFileImporter +from .shadow import ShadowLibraryImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 1ffbd6e7..a84ce90f 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -194,6 +194,8 @@ DOMAIN_REL_MAP = { "www.scielo.cl": "repository", "www.scielo.org.mx": "repository", "zenodo.org": "repository", + "www.biorxiv.org": "repository", + "www.medrxiv.org": "repository", "citeseerx.ist.psu.edu": "aggregator", "publisher-connector.core.ac.uk": "aggregator", @@ -220,6 +222,13 @@ DOMAIN_REL_MAP = { "www.nature.com": "publisher", "www.pnas.org": "publisher", "www.tandfonline.com": "publisher", + "www.frontiersin.org": "publisher", + "www.degruyter.com": "publisher", + "www.mdpi.com": "publisher", + "www.ahajournals.org": "publisher", + "ehp.niehs.nih.gov": "publisher", + "journals.tsu.ru": "publisher", + "www.cogentoa.com": "publisher", "www.researchgate.net": "academicsocial", "academia.edu": "academicsocial", diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 2f77481a..4e382348 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -1,11 +1,11 @@ """ Prototype importer for datacite.org data. -Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8. +Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51 -Datacite being an aggregator, the data is varied and exposes a couple of -problems in content and structure. A few fields habe their own parsing -functions (parse_datacite_...), which can be tested more easily. 
+Datacite being an aggregator, the data is heterogenous and exposes a couple of +problems in content and structure. A few fields have their own parsing +functions (parse_datacite_...), which may help testing. """ import collections @@ -311,6 +311,16 @@ class DataciteImporter(EntityImporter): release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) + # Some records do not use the "dates" field (e.g. micropub), but: + # "attributes.published" or "attributes.publicationYear" + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('published')) + + if not any((release_date, release_month, release_year)): + print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr) + # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". @@ -380,6 +390,11 @@ class DataciteImporter(EntityImporter): len(container_name))) container_name = container_name[0] + # Exception: https://www.micropublication.org/, see: !MR24. + if container_id is None and container_name is None: + if publisher and publisher.lower().startswith('micropublication'): + container_name = publisher + # Volume and issue. volume = container.get('volume') issue = container.get('issue') @@ -490,7 +505,7 @@ class DataciteImporter(EntityImporter): if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" - # Detect language. + # Detect language. This is fuzzy and may be removed, if too unreliable. lang = None try: lang = langdetect.detect(text) @@ -719,8 +734,10 @@ class DataciteImporter(EntityImporter): if name: name = clean(name) - if not name: + if not any((name, given_name, surname)): continue + if not name: + name = "{} {}".format(given_name or '', surname or '').strip() if name in name_blacklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -924,6 +941,32 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle +def parse_single_date(value): + """ + Given a single string containing a date in arbitrary format, try to return + tuple (date: datetime.date, month: int, year: int). + """ + if not value: + return None, None, None + if isinstance(value, int): + value = str(value) + parser = dateparser.DateDataParser() + try: + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + # A datetime object, later we need a date, only. 
+ result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) + + return None, None, None def parse_datacite_dates(dates): """ @@ -966,7 +1009,7 @@ def parse_datacite_dates(dates): ) def parse_item(item): - result, value, year_only = None, item.get('date', ''), False + result, value, year_only = None, item.get('date', '') or '', False release_date, release_month, release_year = None, None, None for layout, granularity in common_patterns: @@ -981,23 +1024,7 @@ def parse_datacite_dates(dates): if result is None: print('fallback for {}'.format(value), file=sys.stderr) - parser = dateparser.DateDataParser() - try: - # Results in a dict with keys: date_obj, period, locale. - parse_result = parser.get_date_data(value) - - # A datetime object, later we need a date, only. - result = parse_result['date_obj'] - if result is not None: - if parse_result['period'] == 'year': - return None, None, result.year - elif parse_result['period'] == 'month': - return None, result.month, result.year - else: - return result.date(), result.month, result.year - except TypeError as err: - print("{} date parsing failed with: {}".format(value, err), - file=sys.stderr) + release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index bdfd2835..4772bfaa 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -31,6 +31,12 @@ class IngestFileResultImporter(EntityImporter): 'fatcat-ingest-container', 'fatcat-ingest', 'arabesque', + 'mag-corpus', + 'mag', + 'unpaywall-corpus', + 'unpaywall', + 's2-corpus', + 's2', ] if kwargs.get('skip_source_whitelist', False): self.ingest_request_source_whitelist = [] @@ -54,11 +60,14 @@ class IngestFileResultImporter(EntityImporter): self.counts['skip-hit'] += 1 return False source = row['request'].get('ingest_request_source') + if not source: + self.counts['skip-ingest_request_source'] += 1 + return False if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist: self.counts['skip-ingest_request_source'] += 1 return False if source.startswith('arabesque'): - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi'): + if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'): self.counts['skip-arabesque-source'] += 1 return False if source.startswith('savepapernow'): @@ -131,7 +140,12 @@ class IngestFileResultImporter(EntityImporter): if not 'terminal_dt' in terminal: terminal['terminal_dt'] = terminal['dt'] assert len(terminal['terminal_dt']) == 14 - url = make_rel_url(terminal['terminal_url'], self.default_link_rel) + + default_rel = self.default_link_rel + if request.get('link_source') == 'doi': + default_rel = 'publisher' + default_rel = request.get('rel', default_rel) + url = make_rel_url(terminal['terminal_url'], default_rel) if not url: self.counts['skip-url'] += 1 @@ -152,8 +166,8 @@ class IngestFileResultImporter(EntityImporter): release_ids=[release_ident], urls=urls, ) - if fatcat and fatcat.get('edit_extra'): - fe.edit_extra = fatcat['edit_extra'] + if request.get('edit_extra'): + 
fe.edit_extra = request['edit_extra'] else: fe.edit_extra = dict() if request.get('ingest_request_source'): @@ -229,6 +243,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def want(self, row): source = row['request'].get('ingest_request_source') + if not source: + self.counts['skip-ingest_request_source'] += 1 + return False if not source.startswith('savepapernow'): self.counts['skip-not-savepapernow'] += 1 return False diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py new file mode 100644 index 00000000..4cd22775 --- /dev/null +++ b/python/fatcat_tools/importers/shadow.py @@ -0,0 +1,195 @@ + +import sys +import json +import sqlite3 +import itertools +import fatcat_openapi_client + +from fatcat_tools.normal import * +from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS + + +class ShadowLibraryImporter(EntityImporter): + """ + Importer for shadow library files (matched to releases) + + Input format is JSON with keys: + - shadow + - shadow_corpus (string slug) + - shadow_id (string) + - doi + - pmid + - isbn13 + - file_meta + - sha1hex + - sha256hex + - md5hex + - size_bytes + - mimetype + - cdx (may be null) + - url + - datetime + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches" + eg_extra = kwargs.pop('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + self.default_link_rel = kwargs.get("default_link_rel", "web") + + def want(self, raw_record): + """ + Only want to import records with complete file-level metadata + """ + fm = raw_record['file_meta'] + if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']): + self.counts['skip-file-meta-incomplete'] += 1 + return False + if fm['mimetype'] != 'application/pdf': + self.counts['skip-not-pdf'] += 1 + return False + return True + + def parse_record(self, obj): + """ + We do the release lookup in this method. Try DOI, then PMID, last ISBN13. 
+ """ + + shadow_corpus = obj['shadow']['shadow_corpus'] + assert shadow_corpus == shadow_corpus.strip().lower() + doi = clean_doi(obj['shadow'].get('doi')) + pmid = clean_pmid(obj['shadow'].get('pmid')) + isbn13 = clean_isbn13(obj['shadow'].get('isbn13')) + shadow_id = obj['shadow'].get('shadow_id').strip() + assert shadow_id + + extra = { '{}_id'.format(shadow_corpus): shadow_id } + for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + if not ext_id: + continue + extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id + + # lookup release via several idents + re = None + for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + if not ext_id: + continue + try: + re = self.api.lookup_release(**{ext_type: ext_id}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status not in (404, 400): + raise err + re = None + if re: + break + + if not re: + self.counts['skip-release-not-found'] += 1 + return None + + release_ids = [re.ident,] + + # parse single CDX into URLs (if exists) + urls = [] + if obj.get('cdx'): + url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel) + if url != None: + urls.append(url) + wayback = "https://web.archive.org/web/{}/{}".format( + obj['cdx']['datetime'], + obj['cdx']['url']) + urls.append(("webarchive", wayback)) + urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] + + fe = fatcat_openapi_client.FileEntity( + md5=obj['file_meta']['md5hex'], + sha1=obj['file_meta']['sha1hex'], + sha256=obj['file_meta']['sha256hex'], + size=int(obj['file_meta']['size_bytes']), + mimetype=obj['file_meta']['mimetype'] or None, + release_ids=release_ids, + urls=urls, + extra=dict(shadows=extra), + ) + return fe + + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + if not existing.extra: + existing.extra = {} + + if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']: + # already imported from this shadow library; skip + self.counts['exists'] += 1 + return False + + # check for edit conflicts + if existing.ident in [e.ident for e in self._edits_inflight]: + self.counts['skip-update-inflight'] += 1 + return False + if fe.sha1 in [e.sha1 for e in self._edits_inflight]: + raise Exception("Inflight insert; shouldn't happen") + + # minimum viable "existing" URL cleanup to fix dupes and broken links: + # remove 'None' wayback URLs, and set archive.org rel 'archive' + existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + for i in range(len(existing.urls)): + u = existing.urls[i] + if u.rel == 'repository' and '://archive.org/download/' in u.url: + existing.urls[i].rel = 'archive' + if u.rel == 'social': + u.rel = 'academicsocial' + + # merge the existing into this one and update + merged_urls = {} + for u in fe.urls + existing.urls: + merged_urls[u.url] = u + existing.urls = list(merged_urls.values()) + if not existing.extra.get('shadows'): + existing.extra['shadows'] = fe.extra['shadows'] + else: + existing.extra['shadows'].update(fe.extra['shadows']) + + # do these "plus ones" because we really want to do these updates when possible + if len(existing.urls) > SANE_MAX_URLS + 1: + self.counts['skip-update-too-many-url'] += 1 + return None + existing.release_ids = 
list(set(fe.release_ids + existing.release_ids)) + if len(existing.release_ids) > SANE_MAX_RELEASES + 1: + self.counts['skip-update-too-many-releases'] += 1 + return None + existing.mimetype = existing.mimetype or fe.mimetype + existing.size = existing.size or fe.size + existing.md5 = existing.md5 or fe.md5 + existing.sha1 = existing.sha1 or fe.sha1 + existing.sha256 = existing.sha256 or fe.sha256 + edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) + # add sha1 to non-entity edit row, so we can do more aggressive + # group-level de-dupe + edit.sha1 = existing.sha1 + self._edits_inflight.append(edit) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 27a4fb93..22b5154e 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -23,16 +23,16 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv) link_source = "arxiv" link_source_id = release.ext_ids.arxiv - elif release.ext_ids.doi: - url = "https://doi.org/{}".format(release.ext_ids.doi) - link_source = "doi" - link_source_id = release.ext_ids.doi elif release.ext_ids.pmcid: # TODO: how to tell if an author manuscript in PMC vs. published? #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) link_source = "pmc" link_source_id = release.ext_ids.pmcid + elif release.ext_ids.doi: + url = "https://doi.org/{}".format(release.ext_ids.doi) + link_source = "doi" + link_source_id = release.ext_ids.doi if not url: return None diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index c8584ccf..b84d5e70 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -102,6 +102,12 @@ class EntityUpdatesWorker(FatcatWorker): # ccdc.cam.ac.uk: crystal structures "10.5517/", ] + self.live_pdf_ingest_doi_prefix_acceptlist = [ + # biorxiv and medrxiv + "10.1101/", + # researchgate + "10.13140/", + ] def want_live_ingest(self, release, ingest_request): """ @@ -115,13 +121,55 @@ class EntityUpdatesWorker(FatcatWorker): link_source = ingest_request.get('ingest_request') ingest_type = ingest_request.get('ingest_type') + doi = ingest_request.get('ext_ids', {}).get('doi') + + is_document = release.release_type in ( + 'article-journal', + 'paper-conference', + 'article', + 'report', + 'chapter', + 'manuscript', + 'review', + 'thesis', + 'letter', + 'editorial', + 'abstract', + 'entry', + 'patent', + 'post', + 'review-book', + ) + is_not_pdf = release.release_type in ( + 'dataset', + 'stub', + 'software', + 'figure', + 'graphic', + ) + + # accept list sets a default "crawl it" despite OA metadata for + # known-OA DOI prefixes + in_acceptlist = False + if doi: + for prefix in self.live_pdf_ingest_doi_prefix_acceptlist: + if doi.startswith(prefix): + in_acceptlist = True if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): es = release_to_elasticsearch(release) - if not es['is_oa']: + # most datacite documents are in IRs and 
should be crawled + is_datacite_doc = False + if release.extra and ('datacite' in release.extra) and is_document: + is_datacite_doc = True + if not (es['is_oa'] or in_acceptlist or is_datacite_doc): return False - doi = ingest_request.get('ext_ids', {}).get('doi') + # if ingest_type is pdf but release_type is almost certainly not a PDF, + # skip it. This is mostly a datacite thing. + if ingest_type == "pdf" and is_not_pdf: + return False + if ingest_type == "pdf" and doi: for prefix in self.ingest_pdf_doi_prefix_blocklist: if doi.startswith(prefix): diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py index af0fea83..591dda80 100644 --- a/python/fatcat_web/entity_helpers.py +++ b/python/fatcat_web/entity_helpers.py @@ -53,6 +53,10 @@ def enrich_release_entity(entity): entity._es = release_to_elasticsearch(entity, force_bool=False) if entity.container and entity.container.state == "active": entity.container._es = container_to_elasticsearch(entity.container, force_bool=False) + if entity.files: + # remove shadows-only files with no URLs + entity.files = [f for f in entity.files + if not (f.extra and f.extra.get('shadows') and not f.urls)] if entity.filesets: for fs in entity.filesets: fs._total_size = sum([f.size for f in fs.manifest]) diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html index 83ecd1c8..961b4759 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -196,8 +196,9 @@ </tbody> </table> {% else %} -<p>There are no known files associated with this release (you could try -<a href="/work/{{ release.work_id }}">other releases for this work?</a>). +<p>There are no accessible files associated with this release. You could check +<a href="/work/{{ release.work_id }}">other releases for this work</a> for an +accessible version. 
{% endif %} {% endif %} diff --git a/python/tests/files/datacite/datacite_doc_30.json b/python/tests/files/datacite/datacite_doc_30.json new file mode 100644 index 00000000..5f851bbb --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_30.json @@ -0,0 +1,72 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "raw_name": "Celja J Uebel", + "givenName": "Celja J", + "familyName": "Uebel", + "affiliation": [], + "role": "author" + }, + { + "raw_name": "Carolyn M Phillips", + "givenName": "Carolyn M", + "familyName": "Phillips", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Phase-separated protein dynamics are affected by fluorescent tag choice" + } + ], + "publisher": "microPublication Biology", + "container": {}, + "publicationYear": 2019, + "subjects": [], + "contributors": [], + "dates": null, + "language": null, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": "Biological liquid-liquid phase separation", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "contentUrl": null, + "metadataVersion": 0, + "schemaVersion": null, + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_30.json b/python/tests/files/datacite/datacite_result_30.json new file mode 100644 index 00000000..fc2c4dfc --- /dev/null +++ b/python/tests/files/datacite/datacite_result_30.json @@ -0,0 +1,39 @@ +{ + "abstracts": [ + { + "content": "Biological liquid-liquid phase separation", + "lang": "fr", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "index": 0, + "given_name": "Celja J", + "surname": "Uebel", + "raw_name": "Celja J Uebel", + "role": "author" + }, + { + "index": 1, + "given_name": "Carolyn M", + "raw_name": "Carolyn M Phillips", + "surname": "Phillips", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Phase-separated protein dynamics are affected by fluorescent tag choice" +} diff --git a/python/tests/files/example_shadow.json b/python/tests/files/example_shadow.json new file mode 100644 index 00000000..3386f481 --- /dev/null +++ b/python/tests/files/example_shadow.json @@ -0,0 +1,10 @@ 
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"12703034","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","doi":"10.1371/journal.pmed.0020124","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","sha256hex":"b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79","md5hex":"debd8db178fa08a7a0aaec6e42832a8e","size_bytes":206121,"mimetype":"application/pdf"},"cdx":{"url":"https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf","datetime":"20180729135948","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"UNPAYWALL-PDF-CRAWL-2018-07-20180729132538992-15980-16048-wbgrp-svc281/UNPAYWALL-PDF-CRAWL-2018-07-20180729135708800-16009-11693~wbgrp-svc281.us.archive.org~8443.warc.gz","warc_csize":32497,"warc_offset":105265425,"row_created":"2019-08-09T23:25:44.571943+00:00"}} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"51052483","sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","doi":"10.1191/0266355403gh289oa","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","sha256hex":"57ce460db4410b9bfaf500ed652fd29e64d46b40c17e28f1156ba03736edf91b","md5hex":"96133eec3a6c533993213e7bdf446251","size_bytes":164344,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"2476283","sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","doi":"10.1016/0042-207x(62)90512-2","pmid":"54321","isbn13":null},"file_meta":{"sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","sha256hex":"e8d0c607b024ff6ffd58a35f76c454844b70ad19fe3f78a573af1ae53f53ad9d","md5hex":"b53318522b9f35a42b7e53f150fe70b2","size_bytes":116735,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"8760871","sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","doi":"10.1016/s0042-207x(79)80945-8","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","sha256hex":"8a69b4a6dff98682ad43e7d4139221c1557c1bd202b615490af8a2c7dcbb71d2","md5hex":"29e1cfac8ecfbc8be57a1ec8b465c4be","size_bytes":138218,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"11473618","sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","doi":"10.1038/ng.2339","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","sha256hex":"a72517e8e72d78bc07a6ef7ff3a6d1d3e04325df986cb8f1bbb4e809f7a9dbdd","md5hex":"9cb8a6e056c9cc740d3bed0c50cd53dc","size_bytes":80992,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"47301218","sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","doi":"10.2307/23406551","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","sha256hex":"315f1d39a00ccf256fa15d92a14869dbda48d31500989aaacb11368f906a5827","md5hex":"8141b42ec3bb41fa87099633a1b61d93","size_bytes":305236,"mimetype":"application/pdf"},"cdx":null} 
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"30603850","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","doi":"10.1109/spire.1998.712983","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","sha256hex":"777e2c472e9d2fec3bbd26bad788562cf1e08e5850315c25cfb6e46d38e7e4af","md5hex":"3a3c92fabaf6cf437bb596d9e9255ff6","size_bytes":113768,"mimetype":"application/pdf"},"cdx":{"url":"http://proteomics.bioprojects.org/pavel/papers/SST_versus_EST_in_gene_recognition..pdf","datetime":"20081121222143","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"1227992340180_31-c/1227992509265_9.arc.gz","warc_csize":61212,"warc_offset":62956683,"row_created":"2020-01-07T02:06:33.965383+00:00"}} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"9311918","sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","doi":"10.1111/j.1439-0507.2008.01572.x","pmid":"18721331","isbn13":null},"file_meta":{"sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","sha256hex":"713758ce0417f604c0a4b0bf5b5eea571a9b08ca4cc81a98d602c43f42abfe37","md5hex":"0df123e6305c617ffd38ebef90b1e318","size_bytes":178664,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"7757772","sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","doi":"10.1007/s10464-011-9424-3","pmid":"21287262","isbn13":null},"file_meta":{"sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","sha256hex":"ee1bce27134ae55b3d67f9b31f66571e41ac496fc3fb526dec2d53513b8f6deb","md5hex":"e72c5cf3d61635821e78ca0306c98887","size_bytes":337857,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"74272862","sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","doi":"10.1002/slct.201802783","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","sha256hex":"f277eefc7b1466df814a7a892ab8e2e7f08db1faae0bf73b893211e5f5b37193","md5hex":"27534b8494f54ba5de47c16fb2590b04","size_bytes":1372272,"mimetype":"application/pdf"},"cdx":null} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 669a6984..15650375 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -287,10 +287,9 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(30): + for i in range(31): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) - print('testing mapping from {} => {}'.format(src, dst)) with open(src, 'r') as f: re = datacite_importer.parse_record(json.load(f)) result = entity_to_dict(re) diff --git a/python/tests/import_shadow.py b/python/tests/import_shadow.py new file mode 100644 index 00000000..70a918d2 --- /dev/null +++ b/python/tests/import_shadow.py @@ -0,0 +1,61 @@ + +import json +import pytest +from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher +from fixtures import api + + +@pytest.fixture(scope="function") +def shadow_importer(api): + yield ShadowLibraryImporter(api) + +# TODO: use API to check that entities actually created... 
+def test_shadow_importer_basic(shadow_importer): + with open('tests/files/example_shadow.json', 'r') as f: + JsonLinePusher(shadow_importer, f).run() + +def test_shadow_importer(shadow_importer): + last_index = shadow_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_shadow.json', 'r') as f: + shadow_importer.bezerk_mode = True + counts = JsonLinePusher(shadow_importer, f).run() + assert counts['insert'] == 2 + assert counts['exists'] == 0 + assert counts['skip'] == 8 + + # fetch most recent editgroup + change = shadow_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "shadow library" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.ShadowLibraryImporter" in eg.extra['agent'] + + # re-insert; should skip + with open('tests/files/example_shadow.json', 'r') as f: + shadow_importer.reset() + shadow_importer.bezerk_mode = False + counts = JsonLinePusher(shadow_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 2 + assert counts['skip'] == 8 + +def test_shadow_dict_parse(shadow_importer): + with open('tests/files/example_shadow.json', 'r') as f: + raw = json.loads(f.readline()) + f = shadow_importer.parse_record(raw) + + assert f.sha1 == "0000002922264275f11cca7b1c3fb662070d0dd7" + assert f.md5 == "debd8db178fa08a7a0aaec6e42832a8e" + assert f.sha256 == "b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79" + assert f.mimetype == "application/pdf" + assert f.size == 206121 + assert len(f.urls) == 2 + for u in f.urls: + if u.rel == "publisher": + assert u.url.startswith("https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf") + if u.rel == "webarchive": + assert u.url.startswith("https://web.archive.org/") + assert "20180729135948" in u.url + assert len(f.release_ids) == 1 + |
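Taken together, the new `shadow-lib` subcommand, `ShadowLibraryImporter`, and the tests above boil down to roughly the following usage sketch. This is illustrative only: `import_shadow_matches` is a hypothetical helper name, and it assumes an already-authenticated fatcat API client (the real CLI wires this up via the FATCAT_AUTH_WORKER_SHADOW token) plus a newline-delimited JSON file matching the format documented in the ShadowLibraryImporter docstring.

import sys

from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher

def import_shadow_matches(api, json_path):
    # api: authenticated fatcat API client (assumption: created elsewhere,
    # e.g. by the fatcat_import.py CLI plumbing)
    importer = ShadowLibraryImporter(api, edit_batch_size=100)
    with open(json_path, 'r') as f:
        # one JSON object per line; JsonLinePusher feeds each record to the
        # importer, which filters (want), parses (parse_record), and batches
        # the resulting file entity inserts/updates
        counts = JsonLinePusher(importer, f).run()
    # counts includes keys like 'insert', 'exists', 'skip' (see the tests above)
    print(counts, file=sys.stderr)
    return counts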