summaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-12-17 01:56:27 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-12-17 23:03:08 -0800
commit9451b3063c2d446748db74027c40c13ee69c24fb (patch)
treea62a4170469b9d8d9bfeec540a7e8c75f1a704a2 /python
parent58ff361eb481bee9d2ef7249f48f94729d2a830d (diff)
downloadfatcat-9451b3063c2d446748db74027c40c13ee69c24fb.tar.gz
fatcat-9451b3063c2d446748db74027c40c13ee69c24fb.zip
improve dblp release import
Diffstat (limited to 'python')
-rw-r--r--python/README_import.md11
-rw-r--r--python/fatcat_tools/importers/dblp_release.py3
-rw-r--r--python/tests/import_dblp.py7
3 files changed, 17 insertions, 4 deletions
diff --git a/python/README_import.md b/python/README_import.md
index 71b15eee..6853a4d7 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -133,3 +133,14 @@ Takes a few hours.
export FATCAT_API_AUTH_TOKEN=... (FATCAT_AUTH_WORKER_DOAJ)
zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+
+## dblp
+
+See `extra/dblp/README.md` for notes about first importing container metadata
+and getting a TSV mapping file to help with import. This is needed because
+there is not (yet) a lookup mechanism for `dblp_prefix` as an identifier of
+container entities.
+
+ export FATCAT_AUTH_WORKER_DBLP=...
+ ./fatcat_import.py dblp-release --dblp-container-map-file /data/dblp/all_dblp_containers.tsv /data/dblp/dblp.xml
+
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index e4d20370..9170dd2f 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -68,6 +68,7 @@ class DblpReleaseImporter(EntityImporter):
if line.startswith("dblp_prefix") or len(line) == 0:
continue
(prefix, container_id) = line.split()[0:2]
+ container_id = container_id.strip()
assert len(container_id) == 26
self._dblp_container_map[prefix] = container_id
print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
@@ -310,7 +311,7 @@ class DblpReleaseImporter(EntityImporter):
# then try other ext_id lookups
if not existing:
- for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv_id'):
+ for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
extid_val = getattr(re.ext_ids, extid_type)
if not extid_val:
continue
diff --git a/python/tests/import_dblp.py b/python/tests/import_dblp.py
index 5041b509..d6b5878d 100644
--- a/python/tests/import_dblp.py
+++ b/python/tests/import_dblp.py
@@ -9,8 +9,8 @@ from fixtures import *
@pytest.fixture(scope="function")
def dblp_importer(api):
- with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield DblpReleaseImporter(api, issn_file, bezerk_mode=True, lookup_refs=True)
+ with open('tests/files/dblp_container_map.tsv', 'r') as tsv_file:
+ yield DblpReleaseImporter(api, tsv_file, bezerk_mode=True)
@pytest.fixture(scope="function")
def dblp_importer_existing(api):
@@ -44,6 +44,7 @@ def test_dblp_importer(dblp_importer):
# check that entity name mangling was fixed on import
eg = dblp_importer.api.get_editgroup(eg.editgroup_id)
release = dblp_importer.api.get_release(eg.edits.releases[0].ident)
+ assert release.contribs[0].raw_name == "Moira C. Norrie"
assert release.contribs[1].raw_name == "Michael H. Böhlen"
last_index = dblp_importer.api.get_changelog(limit=1)[0].index
@@ -119,5 +120,5 @@ def test_dblp_xml_parse(dblp_importer):
assert r1.issue == "11"
assert r1.volume == "51"
assert r1.release_year == 2008
- assert r1.extra['container_name'] == "Commun. ACM"
+ #assert r1.extra['container_name'] == "Commun. ACM"
assert r1.extra['dblp']['type'] == "article"