diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 2 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 11 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/ingest.py | 6 | ||||
| -rw-r--r-- | python/fatcat_tools/normal.py | 4 | 
4 files changed, 19 insertions, 4 deletions
| diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 2446cdbf..fcbe9ad2 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -164,7 +164,7 @@ class EntityImporter:          self.es_client = kwargs.get('es_client')          if not self.es_client: -            self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki") +            self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120)          self._issnl_id_map = dict()          self._orcid_id_map = dict() diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index 5cbc95d0..daecd765 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -323,6 +323,15 @@ class DblpReleaseImporter(EntityImporter):              if err.status != 404:                  raise err +        # Just skip all releases with an arxiv_id for now. Have not decided +        # what to do about grouping works and lookup of un-versioned arxiv_id +        # yet. Note that this means we will lack coverage of some works which +        # have an arxiv preprint, but in those cases we will presumably at +        # least have the pre-print copy/record. +        if re.ext_ids.arxiv: +            self.counts["skip-arxiv"] += 1 +            return False +          # then try other ext_id lookups          if not existing:              for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'): @@ -361,7 +370,7 @@ class DblpReleaseImporter(EntityImporter):              return False          # logic for whether to do update or skip -        if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv_id: +        if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv:              self.counts['skip-update'] += 1              return False diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index cd3d53f6..04ff8db6 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -86,7 +86,7 @@ class IngestFileResultImporter(EntityImporter):              self.counts['skip-ingest_request_source'] += 1              return False -        if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj'): +        if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'):              self.counts['skip-link-source'] += 1              return False @@ -437,7 +437,9 @@ class IngestWebResultImporter(IngestFileResultImporter):          if 'revisit_cdx' in row:              terminal_cdx = row['revisit_cdx']          assert terminal_cdx['surt'] -        assert terminal_cdx['url'] == terminal['terminal_url'] +        if terminal_cdx['url'] != terminal['terminal_url']: +            self.counts['skip-terminal-url-mismatch'] += 1 +            return None          wc_cdx = []          # primary resource first diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index d792979d..4218856c 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -94,6 +94,9 @@ def clean_arxiv_id(raw):      - 'arxiv:' prefix      Works with versioned or un-versioned arxiv identifiers. + +    TODO: version of this function that only works with versioned identifiers? +    That is the behavior of fatcat API      """      if not raw:          return None @@ -116,6 +119,7 @@ def test_clean_arxiv_id():      assert clean_arxiv_id("math.CA/0611800v2") == "math.CA/0611800v2"      assert clean_arxiv_id("math.CA/0611800") == "math.CA/0611800"      assert clean_arxiv_id("0806.2878v1 ") == "0806.2878v1" +    assert clean_arxiv_id("cs/0207047") == "cs/0207047"      assert clean_arxiv_id("https://arxiv.org/abs/0806.2878v1") == "0806.2878v1"      assert clean_arxiv_id("arxiv:0806.2878v1") == "0806.2878v1" | 
