Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/importers/__init__.py        |   1
-rw-r--r--  python/fatcat_tools/importers/common.py          |   9
-rw-r--r--  python/fatcat_tools/importers/ingest.py          |  14
-rw-r--r--  python/fatcat_tools/importers/shadow.py          | 195
-rw-r--r--  python/fatcat_tools/transforms/__init__.py       |   2
-rw-r--r--  python/fatcat_tools/transforms/elasticsearch.py  | 242
-rw-r--r--  python/fatcat_tools/workers/changelog.py         |  40
7 files changed, 453 insertions(+), 50 deletions(-)
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 03c7cbcc..c26446fd 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
 from .wayback_static import auto_wayback_static
 from .cdl_dash_dat import auto_cdl_dash_dat
 from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .shadow import ShadowLibraryImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 5f5c46b8..c000ad62 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -194,6 +194,8 @@ DOMAIN_REL_MAP = {
     "www.scielo.cl": "repository",
     "www.scielo.org.mx": "repository",
     "zenodo.org": "repository",
+    "www.biorxiv.org": "repository",
+    "www.medrxiv.org": "repository",
 
     "citeseerx.ist.psu.edu": "aggregator",
     "publisher-connector.core.ac.uk": "aggregator",
@@ -220,6 +222,13 @@ DOMAIN_REL_MAP = {
     "www.nature.com": "publisher",
     "www.pnas.org": "publisher",
     "www.tandfonline.com": "publisher",
+    "www.frontiersin.org": "publisher",
+    "www.degruyter.com": "publisher",
+    "www.mdpi.com": "publisher",
+    "www.ahajournals.org": "publisher",
+    "ehp.niehs.nih.gov": "publisher",
+    "journals.tsu.ru": "publisher",
+    "www.cogentoa.com": "publisher",
 
     "www.researchgate.net": "academicsocial",
     "academia.edu": "academicsocial",
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index fdaba176..4772bfaa 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -32,8 +32,11 @@ class IngestFileResultImporter(EntityImporter):
             'fatcat-ingest',
             'arabesque',
             'mag-corpus',
+            'mag',
             'unpaywall-corpus',
+            'unpaywall',
             's2-corpus',
+            's2',
         ]
         if kwargs.get('skip_source_whitelist', False):
             self.ingest_request_source_whitelist = []
@@ -137,7 +140,12 @@ class IngestFileResultImporter(EntityImporter):
         if not 'terminal_dt' in terminal:
             terminal['terminal_dt'] = terminal['dt']
         assert len(terminal['terminal_dt']) == 14
-        url = make_rel_url(terminal['terminal_url'], self.default_link_rel)
+
+        default_rel = self.default_link_rel
+        if request.get('link_source') == 'doi':
+            default_rel = 'publisher'
+        default_rel = request.get('rel', default_rel)
+        url = make_rel_url(terminal['terminal_url'], default_rel)
 
         if not url:
             self.counts['skip-url'] += 1
@@ -158,8 +166,8 @@ class IngestFileResultImporter(EntityImporter):
             release_ids=[release_ident],
             urls=urls,
         )
-        if fatcat and fatcat.get('edit_extra'):
-            fe.edit_extra = fatcat['edit_extra']
+        if request.get('edit_extra'):
+            fe.edit_extra = request['edit_extra']
         else:
             fe.edit_extra = dict()
         if request.get('ingest_request_source'):
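The reworked default_rel logic in ingest.py gives an explicit 'rel' on the ingest request top priority, falls back to 'publisher' when the request came from a DOI link source, and only then uses the importer-wide default. A minimal standalone sketch of that precedence (function name and request dicts are ours, for illustration only):

    def pick_rel(request, importer_default='web'):
        # same precedence as the hunk above: request 'rel', then DOI
        # link_source, then the importer-wide default
        default_rel = importer_default
        if request.get('link_source') == 'doi':
            default_rel = 'publisher'
        return request.get('rel', default_rel)

    assert pick_rel({'link_source': 'doi'}) == 'publisher'
    assert pick_rel({'link_source': 'doi', 'rel': 'repository'}) == 'repository'
    assert pick_rel({'link_source': 'unpaywall'}) == 'web'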
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
new file mode 100644
index 00000000..4cd22775
--- /dev/null
+++ b/python/fatcat_tools/importers/shadow.py
@@ -0,0 +1,195 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_openapi_client
+
+from fatcat_tools.normal import *
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+
+
+class ShadowLibraryImporter(EntityImporter):
+    """
+    Importer for shadow library files (matched to releases)
+
+    Input format is JSON with keys:
+    - shadow
+        - shadow_corpus (string slug)
+        - shadow_id (string)
+        - doi
+        - pmid
+        - isbn13
+    - file_meta
+        - sha1hex
+        - sha256hex
+        - md5hex
+        - size_bytes
+        - mimetype
+    - cdx (may be null)
+        - url
+        - datetime
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
+        eg_extra = kwargs.pop('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
+
+    def want(self, raw_record):
+        """
+        Only want to import records with complete file-level metadata
+        """
+        fm = raw_record['file_meta']
+        if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
+            self.counts['skip-file-meta-incomplete'] += 1
+            return False
+        if fm['mimetype'] != 'application/pdf':
+            self.counts['skip-not-pdf'] += 1
+            return False
+        return True
+
+    def parse_record(self, obj):
+        """
+        We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
+        """
+
+        shadow_corpus = obj['shadow']['shadow_corpus']
+        assert shadow_corpus == shadow_corpus.strip().lower()
+        doi = clean_doi(obj['shadow'].get('doi'))
+        pmid = clean_pmid(obj['shadow'].get('pmid'))
+        isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
+        shadow_id = obj['shadow'].get('shadow_id').strip()
+        assert shadow_id
+
+        extra = { '{}_id'.format(shadow_corpus): shadow_id }
+        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+            if not ext_id:
+                continue
+            extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+
+        # lookup release via several idents
+        re = None
+        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+            if not ext_id:
+                continue
+            try:
+                re = self.api.lookup_release(**{ext_type: ext_id})
+            except fatcat_openapi_client.rest.ApiException as err:
+                if err.status not in (404, 400):
+                    raise err
+                re = None
+            if re:
+                break
+
+        if not re:
+            self.counts['skip-release-not-found'] += 1
+            return None
+
+        release_ids = [re.ident,]
+
+        # parse single CDX into URLs (if exists)
+        urls = []
+        if obj.get('cdx'):
+            url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
+            if url != None:
+                urls.append(url)
+            wayback = "https://web.archive.org/web/{}/{}".format(
+                obj['cdx']['datetime'],
+                obj['cdx']['url'])
+            urls.append(("webarchive", wayback))
+        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+
+        fe = fatcat_openapi_client.FileEntity(
+            md5=obj['file_meta']['md5hex'],
+            sha1=obj['file_meta']['sha1hex'],
+            sha256=obj['file_meta']['sha256hex'],
+            size=int(obj['file_meta']['size_bytes']),
+            mimetype=obj['file_meta']['mimetype'] or None,
+            release_ids=release_ids,
+            urls=urls,
+            extra=dict(shadows=extra),
+        )
+        return fe
+
+    def try_update(self, fe):
+        # lookup sha1, or create new entity
+        existing = None
+        try:
+            existing = self.api.lookup_file(sha1=fe.sha1)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        if not existing:
+            return True
+
+        if not existing.extra:
+            existing.extra = {}
+
+        if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
+            # already imported from this shadow library; skip
+            self.counts['exists'] += 1
+            return False
+
+        # check for edit conflicts
+        if existing.ident in [e.ident for e in self._edits_inflight]:
+            self.counts['skip-update-inflight'] += 1
+            return False
+        if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
+            raise Exception("Inflight insert; shouldn't happen")
+
+        # minimum viable "existing" URL cleanup to fix dupes and broken links:
+        # remove 'None' wayback URLs, and set archive.org rel 'archive'
+        existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+        for i in range(len(existing.urls)):
+            u = existing.urls[i]
+            if u.rel == 'repository' and '://archive.org/download/' in u.url:
+                existing.urls[i].rel = 'archive'
+            if u.rel == 'social':
+                u.rel = 'academicsocial'
+
+        # merge the existing into this one and update
+        merged_urls = {}
+        for u in fe.urls + existing.urls:
+            merged_urls[u.url] = u
+        existing.urls = list(merged_urls.values())
+        if not existing.extra.get('shadows'):
+            existing.extra['shadows'] = fe.extra['shadows']
+        else:
+            existing.extra['shadows'].update(fe.extra['shadows'])
+
+        # do these "plus ones" because we really want to do these updates when possible
+        if len(existing.urls) > SANE_MAX_URLS + 1:
+            self.counts['skip-update-too-many-url'] += 1
+            return None
+        existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+        if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
+            self.counts['skip-update-too-many-releases'] += 1
+            return None
+        existing.mimetype = existing.mimetype or fe.mimetype
+        existing.size = existing.size or fe.size
+        existing.md5 = existing.md5 or fe.md5
+        existing.sha1 = existing.sha1 or fe.sha1
+        existing.sha256 = existing.sha256 or fe.sha256
+        edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+        # add sha1 to non-entity edit row, so we can do more aggressive
+        # group-level de-dupe
+        edit.sha1 = existing.sha1
+        self._edits_inflight.append(edit)
+        self.counts['update'] += 1
+        return False
+
+    def insert_batch(self, batch):
+        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
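For reference, a hypothetical input line for ShadowLibraryImporter following the docstring schema above (the corpus slug, identifiers, and sizes are invented, and the hash values are elided):

    {"shadow": {"shadow_corpus": "scimag", "shadow_id": "12345678", "doi": "10.1234/example", "pmid": null, "isbn13": null},
     "file_meta": {"sha1hex": "...", "sha256hex": "...", "md5hex": "...", "size_bytes": 254871, "mimetype": "application/pdf"},
     "cdx": {"url": "https://example.com/fulltext.pdf", "datetime": "20200102030405"}}

Note that want() skips any record missing mimetype, md5hex, sha256hex, or size_bytes, so only the cdx block may be null.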
diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py
index 6a4b1bba..3f4700ff 100644
--- a/python/fatcat_tools/transforms/__init__.py
+++ b/python/fatcat_tools/transforms/__init__.py
@@ -1,5 +1,5 @@
 from .entities import entity_to_dict, entity_from_json, entity_from_dict
-from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch
+from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch, file_to_elasticsearch
 from .csl import release_to_csl, citeproc_csl
 from .ingest import release_ingest_request
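With file_to_elasticsearch now exported from the transforms package, indexing code can treat file entities the same way as releases and containers. A usage sketch, assuming the existing fatcat_tools.public_api helper and a hypothetical file ident:

    from fatcat_tools import public_api
    from fatcat_tools.transforms import file_to_elasticsearch

    api = public_api('https://api.fatcat.wiki/v0')
    fe = api.get_file('aaaaaaaaaaaaamztaaaaaaaaai')  # hypothetical ident
    es_doc = file_to_elasticsearch(fe)
    print(es_doc['sha1'], es_doc['domains'], es_doc['in_ia'])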
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 3a53db4d..87e054ec 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
 import collections
+import tldextract
 
 from fatcat_openapi_client import ApiClient
 
@@ -20,6 +20,7 @@ def test_check_kbart():
     assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False
     assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True
 
+
 def release_to_elasticsearch(entity, force_bool=True):
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
@@ -50,6 +51,10 @@ def release_to_elasticsearch(entity, force_bool=True):
         release_stage = release.release_stage,
         withdrawn_status = release.withdrawn_status,
         language = release.language,
+        volume = release.volume,
+        issue = release.issue,
+        pages = release.pages,
+        number = release.number,
         license = release.license_slug,
         doi = release.ext_ids.doi,
         pmid = release.ext_ids.pmid,
@@ -72,7 +77,7 @@ def release_to_elasticsearch(entity, force_bool=True):
     in_dweb = False
     in_ia = False
     in_ia_sim = False
-    in_shadow = False
+    in_shadows = False
 
     release_year = release.release_year
     if release.release_date:
@@ -85,11 +90,15 @@ def release_to_elasticsearch(entity, force_bool=True):
 
     t['any_abstract'] = len(release.abstracts or []) > 0
     t['ref_count'] = len(release.refs or [])
-    t['ref_linked_count'] = 0
-    if release.refs:
-        t['ref_linked_count'] = len([1 for ref in release.refs if ref.target_release_id])
+    ref_release_ids = []
+    for r in (release.refs or []):
+        if r.target_release_id:
+            ref_release_ids.append(r.target_release_id)
+    t['ref_release_ids'] = ref_release_ids
+    t['ref_linked_count'] = len(ref_release_ids)
     t['contrib_count'] = len(release.contribs or [])
     contrib_names = []
+    contrib_affiliations = []
     creator_ids = []
     for c in (release.contribs or []):
         if c.raw_name:
@@ -98,8 +107,14 @@ def release_to_elasticsearch(entity, force_bool=True):
             contrib_names.append(c.surname)
         if c.creator_id:
             creator_ids.append(c.creator_id)
+        if c.raw_affiliation:
+            contrib_affiliations.append(c.raw_affiliation)
     t['contrib_names'] = contrib_names
     t['creator_ids'] = creator_ids
+    t['affiliations'] = contrib_affiliations
+
+    # TODO: mapping... probably by lookup?
+    t['affiliation_rors'] = None
 
     container = release.container
     if container:
@@ -134,14 +149,19 @@ def release_to_elasticsearch(entity, force_bool=True):
         if c_extra.get('road'):
             if c_extra['road'].get('as_of'):
                 is_oa = True
-        if c_extra.get('ezb'):
-            if c_extra['ezb'].get('color') == 'green':
-                is_oa = True
         if c_extra.get('szczepanski'):
             if c_extra['szczepanski'].get('as_of'):
                 is_oa = True
-    else:
+        if c_extra.get('country'):
+            t['country_code'] = c_extra['country']
+            t['country_code_upper'] = c_extra['country'].upper()
+
+    # fall back to release-level container metadata if container not linked or
+    # missing context
+    if not t.get('publisher'):
         t['publisher'] = release.publisher
+    if not t.get('container_name') and release.extra:
+        t['container_name'] = release.extra.get('container_name')
 
     if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
         in_jstor = True
@@ -187,6 +207,8 @@ def release_to_elasticsearch(entity, force_bool=True):
         # TODO: more/better checks here, particularly strict *not* OA licenses
         if release.license_slug.startswith("CC-"):
             is_oa = True
+        if release.license_slug.startswith("ARXIV-"):
+            is_oa = True
 
     extra = release.extra or dict()
     if extra:
@@ -203,6 +225,47 @@ def release_to_elasticsearch(entity, force_bool=True):
             if extra['crossref'].get('archive'):
                 # all crossref archives are KBART, I believe
                 in_kbart = True
+        # backwards compatible subtitle fetching
+        if not t['subtitle'] and extra.get('subtitle'):
+            if type(extra['subtitle']) == list:
+                t['subtitle'] = extra['subtitle'][0]
+            else:
+                t['subtitle'] = extra['subtitle']
+
+    t['first_page'] = None
+    if release.pages:
+        first = release.pages.split('-')[0]
+        first = first.replace('p', '')
+        if first.isdigit():
+            t['first_page'] = first
+        # TODO: non-numerical first pages
+
+    t['ia_microfilm_url'] = None
+    if in_ia_sim:
+        # TODO: determine URL somehow? I think this is in flux. Will probably
+        # need extra metadata in the container extra field.
+        # special case as a demo for now.
+        if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \
+                and release.release_year in (2011, 2013) \
+                and release.issue \
+                and release.issue.isdigit() \
+                and t['first_page']:
+            t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
+                release.release_year,
+                int(release.issue) - 1,
+                t['first_page'],
+            )
+
+    t['doi_registrar'] = None
+    if extra and t['doi']:
+        for k in ('crossref', 'datacite', 'jalc'):
+            if k in extra:
+                t['doi_registrar'] = k
+        if not 'doi_registrar' in t:
+            t['doi_registrar'] = 'crossref'
+
+    if t['doi']:
+        t['doi_prefix'] = t['doi'].split('/')[0]
 
     if is_longtail_oa:
         is_oa = True
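Two derived fields land in the hunk above: doi_prefix is just the registrant prefix of the DOI, and doi_registrar records which registrar's metadata blob is present in extra. Because the loop keeps overwriting, 'jalc' wins over 'datacite', which wins over 'crossref' when several are present (and since t['doi_registrar'] is pre-set to None, the key is always in t, so the 'crossref' fallback branch can never fire). A quick illustration with an invented extra dict:

    extra = {'crossref': {}, 'datacite': {}}
    doi = '10.1234/abc.def'

    registrar = None
    for k in ('crossref', 'datacite', 'jalc'):
        if k in extra:
            registrar = k
    assert registrar == 'datacite'          # last match wins
    assert doi.split('/')[0] == '10.1234'   # becomes t['doi_prefix']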
@@ -215,6 +278,7 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['in_jstor'] = bool(in_jstor)
         t['in_web'] = bool(in_web)
         t['in_dweb'] = bool(in_dweb)
+        t['in_shadows'] = bool(in_shadows)
     else:
         t['is_oa'] = is_oa
         t['is_longtail_oa'] = is_longtail_oa
@@ -223,11 +287,23 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['in_jstor'] = in_jstor
         t['in_web'] = in_web
         t['in_dweb'] = in_dweb
+        t['in_shadows'] = in_shadows
 
     t['in_ia'] = bool(in_ia)
     t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+
+    if in_ia or t.get('pmcid') or t.get('arxiv_id'):
+        t['preservation'] = 'bright'
+    elif in_kbart or in_jstor:
+        t['preservation'] = 'dark'
+    elif in_shadows:
+        t['preservation'] = 'shadows_only'
+    else:
+        t['preservation'] = 'none'
+
     return t
 
+
 def container_to_elasticsearch(entity, force_bool=True):
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
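The new 'preservation' field above collapses the individual preservation booleans into a single precedence ladder. A standalone sketch of the same logic (the function name is ours, not the source's):

    def preservation_status(in_ia, pmcid, arxiv_id, in_kbart, in_jstor, in_shadows):
        # same precedence as the hunk above: bright > dark > shadows_only > none
        if in_ia or pmcid or arxiv_id:
            return 'bright'
        elif in_kbart or in_jstor:
            return 'dark'
        elif in_shadows:
            return 'shadows_only'
        return 'none'

    # a file held in both a shadow library and a KBART archive counts as 'dark'
    assert preservation_status(False, None, None, True, False, True) == 'dark'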
@@ -257,23 +333,27 @@ def container_to_elasticsearch(entity, force_bool=True):
         wikidata_qid = entity.wikidata_qid,
     )
 
-    # TODO: region, discipline
-    # TODO: single primary language?
     if not entity.extra:
         entity.extra = dict()
-    for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
+    for key in ('country', 'languages', 'mimetypes', 'original_name',
+            'first_year', 'last_year', 'aliases', 'abbrev', 'region',
+            'discipline'):
         if entity.extra.get(key):
             t[key] = entity.extra[key]
 
+    if 'country' in t:
+        t['country_code'] = t.pop('country')
+
+    t['issns'] = []
+    if entity.issnl:
+        t['issns'].append(entity.issnl)
+    for key in ('issnp', 'issne'):
+        if entity.extra.get(key):
+            t['issns'].append(entity.extra[key])
+
     in_doaj = None
     in_road = None
-    # TODO: not currently implemented
-    in_doi = None
-    # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
-    #in_doaj_works = None
-    in_sherpa_romeo = None
     is_oa = None
-    # TODO: not actually set/stored anywhere?
     is_longtail_oa = None
     any_kbart = None
     any_jstor = None
@@ -286,17 +366,15 @@ def container_to_elasticsearch(entity, force_bool=True):
         if extra.get('road'):
             if extra['road'].get('as_of'):
                 in_road = True
-        if extra.get('ezb'):
-            if extra['ezb'].get('color') == 'green':
-                is_oa = True
         if extra.get('szczepanski'):
             if extra['szczepanski'].get('as_of'):
                 is_oa = True
         if extra.get('default_license'):
             if extra['default_license'].startswith('CC-'):
                 is_oa = True
+        t['sherpa_romeo_color'] = None
         if extra.get('sherpa_romeo'):
-            in_sherpa_romeo = True
+            t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color')
             if extra['sherpa_romeo'].get('color') == 'white':
                 is_oa = False
         if extra.get('kbart'):
@@ -306,54 +384,128 @@ def container_to_elasticsearch(entity, force_bool=True):
         if extra.get('ia'):
             if extra['ia'].get('sim'):
                 any_ia_sim = True
+            if extra['ia'].get('longtail_oa'):
+                is_longtail_oa = True
 
     t['is_superceded'] = bool(extra.get('superceded'))
     t['in_doaj'] = bool(in_doaj)
     t['in_road'] = bool(in_road)
-    t['in_sherpa_romeo'] = bool(in_sherpa_romeo)
     t['any_kbart'] = bool(any_kbart)
-    t['is_longtail_oa'] = bool(is_longtail_oa)
     if force_bool:
-        t['in_doi'] = bool(in_doi)
-        t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa)
+        t['is_oa'] = bool(in_doaj or in_road or is_oa)
+        t['is_longtail_oa'] = bool(is_longtail_oa)
         t['any_jstor'] = bool(any_jstor)
         t['any_ia_sim'] = bool(any_ia_sim)
     else:
-        t['in_doi'] = in_doi
-        t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
+        t['is_oa'] = in_doaj or in_road or is_oa
+        t['is_longtail_oa'] = is_longtail_oa
         t['any_jstor'] = any_jstor
         t['any_ia_sim'] = any_ia_sim
     return t
 
 
+def _type_of_edit(edit):
+    if edit.revision == None and edit.redirect_ident == None:
+        return 'delete'
+    elif edit.redirect_ident:
+        # redirect
+        return 'update'
+    elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision:
+        return 'create'
+    else:
+        return 'update'
+
+
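_type_of_edit() classifies an edit purely from which revision pointers are set. A quick sanity table (edit objects here are SimpleNamespace stand-ins, not real API models):

    from types import SimpleNamespace as NS

    assert _type_of_edit(NS(revision=None, redirect_ident=None, prev_revision=None)) == 'delete'
    assert _type_of_edit(NS(revision='rev2', redirect_ident='other-ident', prev_revision=None)) == 'update'  # redirect
    assert _type_of_edit(NS(revision='rev1', redirect_ident=None, prev_revision=None)) == 'create'
    assert _type_of_edit(NS(revision='rev2', redirect_ident=None, prev_revision='rev1')) == 'update'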
+ """ editgroup = entity.editgroup t = dict( index=entity.index, editgroup_id=entity.editgroup_id, - timestamp=entity.timestamp, + timestamp=entity.timestamp.isoformat(), editor_id=editgroup.editor_id, + username=editgroup.editor.username, + is_bot=editgroup.editor.is_bot, + is_admin=editgroup.editor.is_admin, ) extra = editgroup.extra or dict() if extra.get('agent'): t['agent'] = extra['agent'] - t['containers'] = len(editgroup.edits.containers) - t['creators'] = len(editgroup.edits.containers) - t['files'] = len(editgroup.edits.containers) - t['filesets'] = len(editgroup.edits.containers) - t['webcaptures'] = len(editgroup.edits.containers) - t['releases'] = len(editgroup.edits.containers) - t['works'] = len(editgroup.edits.containers) - - # TODO: parse and pull out counts - #created = 0 - #updated = 0 - #deleted = 0 - #t['created'] = created - #t['updated'] = updated - #t['deleted'] = deleted - #t['total'] = created + updated + deleted + containers = [_type_of_edit(e) for e in editgroup.edits.containers] + creators = [_type_of_edit(e) for e in editgroup.edits.creators] + files = [_type_of_edit(e) for e in editgroup.edits.files] + filesets = [_type_of_edit(e) for e in editgroup.edits.filesets] + webcaptures = [_type_of_edit(e) for e in editgroup.edits.webcaptures] + releases = [_type_of_edit(e) for e in editgroup.edits.releases] + works = [_type_of_edit(e) for e in editgroup.edits.works] + + t['containers'] = len(containers) + t['new_containers'] = len([e for e in containers if e == 'create']) + t['creators'] = len(creators) + t['new_creators'] = len([e for e in creators if e == 'create']) + t['files'] = len(files) + t['new_files'] = len([e for e in files if e == 'create']) + t['filesets'] = len(filesets) + t['new_filesets'] = len([e for e in filesets if e == 'create']) + t['webcaptures'] = len(webcaptures) + t['new_webcaptures'] = len([e for e in webcaptures if e == 'create']) + t['releases'] = len(releases) + t['new_releases'] = len([e for e in releases if e == 'create']) + t['works'] = len(works) + t['new_works'] = len([e for e in works if e == 'create']) + + all_edits = containers + creators + files + filesets + webcaptures + releases + works + + t['created'] = len([e for e in all_edits if e == 'create']) + t['updated'] = len([e for e in all_edits if e == 'update']) + t['deleted'] = len([e for e in all_edits if e == 'delete']) + t['total'] = len(all_edits) + return t + + +def file_to_elasticsearch(entity): + """ + Converts from an entity model/schema to elasticsearch oriented schema. 
+
+    Returns: dict
+    Raises exception on error (never returns None)
+    """
+
+    if entity.state in ('redirect', 'deleted'):
+        return dict(
+            ident = entity.ident,
+            state = entity.state,
+        )
+    elif entity.state != 'active':
+        raise ValueError("Unhandled entity state: {}".format(entity.state))
+
+    # First, the easy ones (direct copy)
+    t = dict(
+        ident = entity.ident,
+        state = entity.state,
+        revision = entity.revision,
+        release_ids = entity.release_ids,
+        release_count = len(entity.release_ids),
+        mimetype = entity.mimetype,
+        size_bytes = entity.size,
+        sha1 = entity.sha1,
+        sha256 = entity.sha256,
+        md5 = entity.md5,
+    )
+
+    parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+    t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
+    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
+    t['rels'] = list(set([u.rel for u in entity.urls]))
+
+    t['in_ia'] = bool('archive.org' in t['domains'])
+    t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+    return t
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 7a9a585d..b84d5e70 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -105,6 +105,8 @@ class EntityUpdatesWorker(FatcatWorker):
         self.live_pdf_ingest_doi_prefix_acceptlist = [
             # biorxiv and medrxiv
             "10.1101/",
+            # researchgate
+            "10.13140/",
         ]
 
     def want_live_ingest(self, release, ingest_request):
@@ -121,6 +123,33 @@ class EntityUpdatesWorker(FatcatWorker):
         ingest_type = ingest_request.get('ingest_type')
         doi = ingest_request.get('ext_ids', {}).get('doi')
 
+        is_document = release.release_type in (
+            'article-journal',
+            'paper-conference',
+            'article',
+            'report',
+            'chapter',
+            'manuscript',
+            'review',
+            'thesis',
+            'letter',
+            'editorial',
+            'abstract',
+            'entry',
+            'patent',
+            'post',
+            'review-book',
+        )
+        is_not_pdf = release.release_type in (
+            'dataset',
+            'stub',
+            'software',
+            'figure',
+            'graphic',
+        )
+
+        # accept list sets a default "crawl it" despite OA metadata for
+        # known-OA DOI prefixes
         in_acceptlist = False
         if doi:
             for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
@@ -129,9 +158,18 @@ class EntityUpdatesWorker(FatcatWorker):
 
         if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
             es = release_to_elasticsearch(release)
-            if not es['is_oa'] and not in_acceptlist:
+            # most datacite documents are in IRs and should be crawled
+            is_datacite_doc = False
+            if release.extra and ('datacite' in release.extra) and is_document:
+                is_datacite_doc = True
+            if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
                 return False
 
+        # if ingest_type is pdf but release_type is almost certainly not a PDF,
+        # skip it. This is mostly a datacite thing.
+        if ingest_type == "pdf" and is_not_pdf:
+            return False
+
         if ingest_type == "pdf" and doi:
             for prefix in self.ingest_pdf_doi_prefix_blocklist:
                 if doi.startswith(prefix):
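Taken together, the want_live_ingest() changes mean a DataCite DOI only triggers a live PDF ingest when its release_type looks like a document, and a 'pdf' ingest is dropped outright for types that are almost certainly not PDFs. A condensed sketch of the gate (simplified: the real method also checks link_source, OA-only mode, and DOI prefix blocklists):

    def should_crawl_pdf(release_type, is_oa, in_acceptlist, is_datacite):
        # condensed from the hunks above, for illustration only
        is_document = release_type in ('article-journal', 'paper-conference', 'report')
        is_not_pdf = release_type in ('dataset', 'stub', 'software', 'figure', 'graphic')
        if not (is_oa or in_acceptlist or (is_datacite and is_document)):
            return False
        if is_not_pdf:
            return False
        return True

    assert should_crawl_pdf('article-journal', False, False, True) is True   # DataCite document
    assert should_crawl_pdf('dataset', True, False, False) is False          # never a PDF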