From 9451b3063c2d446748db74027c40c13ee69c24fb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 17 Dec 2020 01:56:27 -0800 Subject: improve dblp release import --- python/README_import.md | 11 +++++++++++ python/fatcat_tools/importers/dblp_release.py | 3 ++- python/tests/import_dblp.py | 7 ++++--- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'python') diff --git a/python/README_import.md b/python/README_import.md index 71b15eee..6853a4d7 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -133,3 +133,14 @@ Takes a few hours. export FATCAT_API_AUTH_TOKEN=... (FATCAT_AUTH_WORKER_DOAJ) zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + +## dblp + +See `extra/dblp/README.md` for notes about first importing container metadata +and getting a TSV mapping file to help with import. This is needed because +there is not (yet) a lookup mechanism for `dblp_prefix` as an identifier of +container entities. + + export FATCAT_AUTH_WORKER_DBLP=... 
+ ./fatcat_import.py dblp-release --dblp-container-map-file /data/dblp/all_dblp_containers.tsv /data/dblp/dblp.xml + diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index e4d20370..9170dd2f 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -68,6 +68,7 @@ class DblpReleaseImporter(EntityImporter): if line.startswith("dblp_prefix") or len(line) == 0: continue (prefix, container_id) = line.split()[0:2] + container_id = container_id.strip() assert len(container_id) == 26 self._dblp_container_map[prefix] = container_id print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr) @@ -310,7 +311,7 @@ class DblpReleaseImporter(EntityImporter): # then try other ext_id lookups if not existing: - for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv_id'): + for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'): extid_val = getattr(re.ext_ids, extid_type) if not extid_val: continue diff --git a/python/tests/import_dblp.py b/python/tests/import_dblp.py index 5041b509..d6b5878d 100644 --- a/python/tests/import_dblp.py +++ b/python/tests/import_dblp.py @@ -9,8 +9,8 @@ from fixtures import * @pytest.fixture(scope="function") def dblp_importer(api): - with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield DblpReleaseImporter(api, issn_file, bezerk_mode=True, lookup_refs=True) + with open('tests/files/dblp_container_map.tsv', 'r') as tsv_file: + yield DblpReleaseImporter(api, tsv_file, bezerk_mode=True) @pytest.fixture(scope="function") def dblp_importer_existing(api): @@ -44,6 +44,7 @@ def test_dblp_importer(dblp_importer): # check that entity name mangling was fixed on import eg = dblp_importer.api.get_editgroup(eg.editgroup_id) release = dblp_importer.api.get_release(eg.edits.releases[0].ident) + assert release.contribs[0].raw_name == "Moira C. 
Norrie" assert release.contribs[1].raw_name == "Michael H. Böhlen" last_index = dblp_importer.api.get_changelog(limit=1)[0].index @@ -119,5 +120,5 @@ def test_dblp_xml_parse(dblp_importer): assert r1.issue == "11" assert r1.volume == "51" assert r1.release_year == 2008 - assert r1.extra['container_name'] == "Commun. ACM" + #assert r1.extra['container_name'] == "Commun. ACM" assert r1.extra['dblp']['type'] == "article" -- cgit v1.2.3