diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 11 | ||||
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 6 |
3 files changed, 15 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2446cdbf..fcbe9ad2 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -164,7 +164,7 @@ class EntityImporter:
 
         self.es_client = kwargs.get('es_client')
         if not self.es_client:
-            self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
+            self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120)
 
         self._issnl_id_map = dict()
         self._orcid_id_map = dict()
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5cbc95d0..daecd765 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -323,6 +323,15 @@ class DblpReleaseImporter(EntityImporter):
             if err.status != 404:
                 raise err
 
+        # Just skip all releases with an arxiv_id for now. Have not decided
+        # what to do about grouping works and lookup of un-versioned arxiv_id
+        # yet. Note that this means we will lack coverage of some works which
+        # have an arxiv preprint, but in those cases we will presumably at
+        # least have the pre-print copy/record.
+        if re.ext_ids.arxiv:
+            self.counts["skip-arxiv"] += 1
+            return False
+
         # then try other ext_id lookups
         if not existing:
             for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
@@ -361,7 +370,7 @@ class DblpReleaseImporter(EntityImporter):
             return False
 
         # logic for whether to do update or skip
-        if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv_id:
+        if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv:
             self.counts['skip-update'] += 1
             return False
 
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index cd3d53f6..04ff8db6 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -86,7 +86,7 @@ class IngestFileResultImporter(EntityImporter):
             self.counts['skip-ingest_request_source'] += 1
             return False
 
-        if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj'):
+        if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'):
             self.counts['skip-link-source'] += 1
             return False
 
@@ -437,7 +437,9 @@ class IngestWebResultImporter(IngestFileResultImporter):
         if 'revisit_cdx' in row:
             terminal_cdx = row['revisit_cdx']
         assert terminal_cdx['surt']
-        assert terminal_cdx['url'] == terminal['terminal_url']
+        if terminal_cdx['url'] != terminal['terminal_url']:
+            self.counts['skip-terminal-url-mismatch'] += 1
+            return None
 
         wc_cdx = []
         # primary resource first