Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/__init__.py      1
-rw-r--r--  python/fatcat_tools/importers/common.py         9
-rw-r--r--  python/fatcat_tools/importers/datacite.py      75
-rw-r--r--  python/fatcat_tools/importers/ingest.py        25
-rw-r--r--  python/fatcat_tools/importers/shadow.py       195
5 files changed, 277 insertions, 28 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d936605f..10557ef8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
 from .wayback_static import auto_wayback_static
 from .cdl_dash_dat import auto_cdl_dash_dat
 from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .shadow import ShadowLibraryImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 1ffbd6e7..a84ce90f 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -194,6 +194,8 @@ DOMAIN_REL_MAP = {
     "www.scielo.cl": "repository",
     "www.scielo.org.mx": "repository",
     "zenodo.org": "repository",
+    "www.biorxiv.org": "repository",
+    "www.medrxiv.org": "repository",
 
     "citeseerx.ist.psu.edu": "aggregator",
     "publisher-connector.core.ac.uk": "aggregator",
@@ -220,6 +222,13 @@ DOMAIN_REL_MAP = {
     "www.nature.com": "publisher",
     "www.pnas.org": "publisher",
     "www.tandfonline.com": "publisher",
+    "www.frontiersin.org": "publisher",
+    "www.degruyter.com": "publisher",
+    "www.mdpi.com": "publisher",
+    "www.ahajournals.org": "publisher",
+    "ehp.niehs.nih.gov": "publisher",
+    "journals.tsu.ru": "publisher",
+    "www.cogentoa.com": "publisher",
 
     "www.researchgate.net": "academicsocial",
     "academia.edu": "academicsocial",
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 2f77481a..4e382348 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,11 +1,11 @@
 """
 Prototype importer for datacite.org data.
 
-Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8.
+Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51
 
-Datacite being an aggregator, the data is varied and exposes a couple of
-problems in content and structure. A few fields habe their own parsing
-functions (parse_datacite_...), which can be tested more easily.
+Datacite being an aggregator, the data is heterogenous and exposes a couple of
+problems in content and structure. A few fields have their own parsing
+functions (parse_datacite_...), which may help testing.
 """
 
 import collections
@@ -311,6 +311,16 @@ class DataciteImporter(EntityImporter):
         release_date, release_month, release_year = parse_datacite_dates(
             attributes.get('dates', []))
 
+        # Some records do not use the "dates" field (e.g. micropub), but:
+        # "attributes.published" or "attributes.publicationYear"
+        if not any((release_date, release_month, release_year)):
+            release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear'))
+            if not any((release_date, release_month, release_year)):
+                release_date, release_month, release_year = parse_single_date(attributes.get('published'))
+
+        if not any((release_date, release_month, release_year)):
+            print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)
+
         # Start with clear stages, e.g. published. TODO(martin): we could
         # probably infer a bit more from the relations, e.g.
         # "IsPreviousVersionOf" or "IsNewVersionOf".
@@ -380,6 +390,11 @@ class DataciteImporter(EntityImporter):
                     len(container_name)))
                 container_name = container_name[0]
 
+        # Exception: https://www.micropublication.org/, see: !MR24.
+        if container_id is None and container_name is None:
+            if publisher and publisher.lower().startswith('micropublication'):
+                container_name = publisher
+
         # Volume and issue.
         volume = container.get('volume')
         issue = container.get('issue')
@@ -490,7 +505,7 @@ class DataciteImporter(EntityImporter):
             if len(text) > MAX_ABSTRACT_LENGTH:
                 text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
 
-            # Detect language.
+            # Detect language. This is fuzzy and may be removed, if too unreliable.
             lang = None
             try:
                 lang = langdetect.detect(text)
@@ -719,8 +734,10 @@ class DataciteImporter(EntityImporter):
             if name:
                 name = clean(name)
-            if not name:
+            if not any((name, given_name, surname)):
                 continue
+            if not name:
+                name = "{} {}".format(given_name or '', surname or '').strip()
             if name in name_blacklist:
                 continue
             if name.lower() in UNKNOWN_MARKERS_LOWER:
@@ -924,6 +941,32 @@ def parse_datacite_titles(titles):
     return title, original_language_title, subtitle
 
+def parse_single_date(value):
+    """
+    Given a single string containing a date in arbitrary format, try to return
+    tuple (date: datetime.date, month: int, year: int).
+    """
+    if not value:
+        return None, None, None
+    if isinstance(value, int):
+        value = str(value)
+    parser = dateparser.DateDataParser()
+    try:
+        # Results in a dict with keys: date_obj, period, locale.
+        parse_result = parser.get_date_data(value)
+        # A datetime object, later we need a date, only.
+        result = parse_result['date_obj']
+        if result is not None:
+            if parse_result['period'] == 'year':
+                return None, None, result.year
+            elif parse_result['period'] == 'month':
+                return None, result.month, result.year
+            else:
+                return result.date(), result.month, result.year
+    except TypeError as err:
+        print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
+
+    return None, None, None
 
 def parse_datacite_dates(dates):
     """
@@ -966,7 +1009,7 @@
     )
 
     def parse_item(item):
-        result, value, year_only = None, item.get('date', ''), False
+        result, value, year_only = None, item.get('date', '') or '', False
         release_date, release_month, release_year = None, None, None
 
         for layout, granularity in common_patterns:
@@ -981,23 +1024,7 @@
 
         if result is None:
             print('fallback for {}'.format(value), file=sys.stderr)
-            parser = dateparser.DateDataParser()
-            try:
-                # Results in a dict with keys: date_obj, period, locale.
-                parse_result = parser.get_date_data(value)
-
-                # A datetime object, later we need a date, only.
-                result = parse_result['date_obj']
-                if result is not None:
-                    if parse_result['period'] == 'year':
-                        return None, None, result.year
-                    elif parse_result['period'] == 'month':
-                        return None, result.month, result.year
-                    else:
-                        return result.date(), result.month, result.year
-            except TypeError as err:
-                print("{} date parsing failed with: {}".format(value, err),
-                      file=sys.stderr)
+            release_date, release_month, release_year = parse_single_date(value)
 
         if result is None:
             # Unparsable date.
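The new parse_single_date() helper returns a (date, month, year) tuple at whatever granularity dateparser reports, and is used both as the fallback inside parse_datacite_dates() and for the "publicationYear"/"published" attributes. A rough usage sketch (the exact outputs depend on dateparser's period detection, so treat the expected values as assumptions):

    from fatcat_tools.importers.datacite import parse_single_date

    parse_single_date(None)          # (None, None, None); dateparser is never invoked
    parse_single_date(2019)          # int input is stringified first; likely (None, None, 2019)
    parse_single_date('March 2019')  # month granularity; likely (None, 3, 2019)
    parse_single_date('2019-03-15')  # full date; likely (datetime.date(2019, 3, 15), 3, 2019)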
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index bdfd2835..4772bfaa 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -31,6 +31,12 @@ class IngestFileResultImporter(EntityImporter):
             'fatcat-ingest-container',
             'fatcat-ingest',
             'arabesque',
+            'mag-corpus',
+            'mag',
+            'unpaywall-corpus',
+            'unpaywall',
+            's2-corpus',
+            's2',
         ]
         if kwargs.get('skip_source_whitelist', False):
             self.ingest_request_source_whitelist = []
@@ -54,11 +60,14 @@ class IngestFileResultImporter(EntityImporter):
             self.counts['skip-hit'] += 1
             return False
         source = row['request'].get('ingest_request_source')
+        if not source:
+            self.counts['skip-ingest_request_source'] += 1
+            return False
         if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
             self.counts['skip-ingest_request_source'] += 1
             return False
         if source.startswith('arabesque'):
-            if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi'):
+            if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'):
                 self.counts['skip-arabesque-source'] += 1
                 return False
         if source.startswith('savepapernow'):
@@ -131,7 +140,12 @@ class IngestFileResultImporter(EntityImporter):
         if not 'terminal_dt' in terminal:
             terminal['terminal_dt'] = terminal['dt']
         assert len(terminal['terminal_dt']) == 14
-        url = make_rel_url(terminal['terminal_url'], self.default_link_rel)
+
+        default_rel = self.default_link_rel
+        if request.get('link_source') == 'doi':
+            default_rel = 'publisher'
+        default_rel = request.get('rel', default_rel)
+        url = make_rel_url(terminal['terminal_url'], default_rel)
 
         if not url:
             self.counts['skip-url'] += 1
@@ -152,8 +166,8 @@ class IngestFileResultImporter(EntityImporter):
             release_ids=[release_ident],
             urls=urls,
         )
-        if fatcat and fatcat.get('edit_extra'):
-            fe.edit_extra = fatcat['edit_extra']
+        if request.get('edit_extra'):
+            fe.edit_extra = request['edit_extra']
         else:
             fe.edit_extra = dict()
         if request.get('ingest_request_source'):
@@ -229,6 +243,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
 
     def want(self, row):
         source = row['request'].get('ingest_request_source')
+        if not source:
+            self.counts['skip-ingest_request_source'] += 1
+            return False
         if not source.startswith('savepapernow'):
             self.counts['skip-not-savepapernow'] += 1
             return False
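With the ingest.py changes, the link rel for the terminal URL is resolved per request instead of always using the importer-wide default: an explicit 'rel' on the ingest request wins, and DOI-sourced requests default to 'publisher'. A condensed sketch of that precedence (the helper name and the request dicts below are illustrative, not part of the patch):

    def resolve_default_rel(request, default_link_rel='web'):
        # mirrors the logic added just before the make_rel_url() call above
        default_rel = default_link_rel
        if request.get('link_source') == 'doi':
            default_rel = 'publisher'
        return request.get('rel', default_rel)

    resolve_default_rel({'link_source': 'doi'})                       # 'publisher'
    resolve_default_rel({'link_source': 'doi', 'rel': 'repository'})  # 'repository' (explicit rel wins)
    resolve_default_rel({'link_source': 'unpaywall'})                 # 'web' (importer default)

The resolved value is still only a default: make_rel_url() may replace it for domains classified in DOMAIN_REL_MAP, which the common.py additions above extend.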
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
new file mode 100644
index 00000000..4cd22775
--- /dev/null
+++ b/python/fatcat_tools/importers/shadow.py
@@ -0,0 +1,195 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_openapi_client
+
+from fatcat_tools.normal import *
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+
+
+class ShadowLibraryImporter(EntityImporter):
+    """
+    Importer for shadow library files (matched to releases)
+
+    Input format is JSON with keys:
+    - shadow
+        - shadow_corpus (string slug)
+        - shadow_id (string)
+        - doi
+        - pmid
+        - isbn13
+    - file_meta
+        - sha1hex
+        - sha256hex
+        - md5hex
+        - size_bytes
+        - mimetype
+    - cdx (may be null)
+        - url
+        - datetime
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
+        eg_extra = kwargs.pop('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
+
+    def want(self, raw_record):
+        """
+        Only want to import records with complete file-level metadata
+        """
+        fm = raw_record['file_meta']
+        if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
+            self.counts['skip-file-meta-incomplete'] += 1
+            return False
+        if fm['mimetype'] != 'application/pdf':
+            self.counts['skip-not-pdf'] += 1
+            return False
+        return True
+
+    def parse_record(self, obj):
+        """
+        We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
+        """
+
+        shadow_corpus = obj['shadow']['shadow_corpus']
+        assert shadow_corpus == shadow_corpus.strip().lower()
+        doi = clean_doi(obj['shadow'].get('doi'))
+        pmid = clean_pmid(obj['shadow'].get('pmid'))
+        isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
+        shadow_id = obj['shadow'].get('shadow_id').strip()
+        assert shadow_id
+
+        extra = { '{}_id'.format(shadow_corpus): shadow_id }
+        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+            if not ext_id:
+                continue
+            extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+
+        # lookup release via several idents
+        re = None
+        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+            if not ext_id:
+                continue
+            try:
+                re = self.api.lookup_release(**{ext_type: ext_id})
+            except fatcat_openapi_client.rest.ApiException as err:
+                if err.status not in (404, 400):
+                    raise err
+                re = None
+            if re:
+                break
+
+        if not re:
+            self.counts['skip-release-not-found'] += 1
+            return None
+
+        release_ids = [re.ident,]
+
+        # parse single CDX into URLs (if exists)
+        urls = []
+        if obj.get('cdx'):
+            url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
+            if url != None:
+                urls.append(url)
+            wayback = "https://web.archive.org/web/{}/{}".format(
+                obj['cdx']['datetime'],
+                obj['cdx']['url'])
+            urls.append(("webarchive", wayback))
+        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+
+        fe = fatcat_openapi_client.FileEntity(
+            md5=obj['file_meta']['md5hex'],
+            sha1=obj['file_meta']['sha1hex'],
+            sha256=obj['file_meta']['sha256hex'],
+            size=int(obj['file_meta']['size_bytes']),
+            mimetype=obj['file_meta']['mimetype'] or None,
+            release_ids=release_ids,
+            urls=urls,
+            extra=dict(shadows=extra),
+        )
+        return fe
+
+    def try_update(self, fe):
+        # lookup sha1, or create new entity
+        existing = None
+        try:
+            existing = self.api.lookup_file(sha1=fe.sha1)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        if not existing:
+            return True
+
+        if not existing.extra:
+            existing.extra = {}
+
+        if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
+            # already imported from this shadow library; skip
+            self.counts['exists'] += 1
+            return False
+
+        # check for edit conflicts
+        if existing.ident in [e.ident for e in self._edits_inflight]:
+            self.counts['skip-update-inflight'] += 1
+            return False
+        if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
+            raise Exception("Inflight insert; shouldn't happen")
+
+        # minimum viable "existing" URL cleanup to fix dupes and broken links:
+        # remove 'None' wayback URLs, and set archive.org rel 'archive'
+        existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+        for i in range(len(existing.urls)):
+            u = existing.urls[i]
+            if u.rel == 'repository' and '://archive.org/download/' in u.url:
+                existing.urls[i].rel = 'archive'
+            if u.rel == 'social':
+                u.rel = 'academicsocial'
+
+        # merge the existing into this one and update
+        merged_urls = {}
+        for u in fe.urls + existing.urls:
+            merged_urls[u.url] = u
+        existing.urls = list(merged_urls.values())
+        if not existing.extra.get('shadows'):
+            existing.extra['shadows'] = fe.extra['shadows']
+        else:
+            existing.extra['shadows'].update(fe.extra['shadows'])
+
+        # do these "plus ones" because we really want to do these updates when possible
+        if len(existing.urls) > SANE_MAX_URLS + 1:
+            self.counts['skip-update-too-many-url'] += 1
+            return None
+        existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+        if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
+            self.counts['skip-update-too-many-releases'] += 1
+            return None
+        existing.mimetype = existing.mimetype or fe.mimetype
+        existing.size = existing.size or fe.size
+        existing.md5 = existing.md5 or fe.md5
+        existing.sha1 = existing.sha1 or fe.sha1
+        existing.sha256 = existing.sha256 or fe.sha256
+        edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+        # add sha1 to non-entity edit row, so we can do more aggressive
+        # group-level de-dupe
+        edit.sha1 = existing.sha1
+        self._edits_inflight.append(edit)
+        self.counts['update'] += 1
+        return False
+
+    def insert_batch(self, batch):
+        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
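For reference, a single ShadowLibraryImporter input record would look roughly like the following; the key layout follows the docstring and parse_record() above, while all concrete values are invented placeholders:

    record = {
        "shadow": {
            "shadow_corpus": "example_corpus",   # placeholder slug; must already be lower-case
            "shadow_id": "12345",
            "doi": "10.1234/placeholder",        # at least one of doi/pmid/isbn13 should resolve to a release
            "pmid": None,
            "isbn13": None,
        },
        "file_meta": {
            "sha1hex": "<40-char hex digest>",
            "sha256hex": "<64-char hex digest>",
            "md5hex": "<32-char hex digest>",
            "size_bytes": 123456,
            "mimetype": "application/pdf",       # anything else is skipped by want()
        },
        "cdx": {                                 # may be null; when present, a wayback URL is added
            "url": "https://example.com/paper.pdf",
            "datetime": "20200101120000",
        },
    }

Records missing mimetype, md5hex, sha256hex, or size_bytes never reach parse_record(); they are counted under skip-file-meta-incomplete instead.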