author     Bryan Newbold <bnewbold@robocracy.org>   2020-12-17 01:55:30 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>   2020-12-17 23:03:08 -0800
commit     58ff361eb481bee9d2ef7249f48f94729d2a830d (patch)
tree       9cbcbcf1c7eafa958c81d18fcf5b6a5d27f2f42d /python
parent     b2f53077bd05a536b5fdb551755a559b653421d3 (diff)
download   fatcat-58ff361eb481bee9d2ef7249f48f94729d2a830d.tar.gz
           fatcat-58ff361eb481bee9d2ef7249f48f94729d2a830d.zip
very simple dblp container importer
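
The new subcommand wires a JSON-lines pusher to the importer. As a minimal usage sketch, mirroring run_dblp_container() from the diff below (`api` stands in for an already-authenticated fatcat API client, and the file names are hypothetical placeholders, not paths this commit prescribes):

    from fatcat_tools.importers import DblpContainerImporter, JsonLinePusher

    # file names below are illustrative only
    with open('dblp_containers.json') as json_file, \
         open('ISSN-to-ISSN-L.txt') as issn_map_file, \
         open('existing_map.tsv') as map_in, \
         open('new_map.tsv', 'w') as map_out:
        dci = DblpContainerImporter(
            api,                                # authenticated fatcat API client (assumed)
            issn_map_file,                      # ISSN -> ISSN-L mapping file
            dblp_container_map_file=map_in,     # pre-existing prefix -> container_id TSV
            dblp_container_map_output=map_out,  # prefix/ident rows are written here
            edit_batch_size=50,
        )
        JsonLinePusher(dci, json_file).run()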
Diffstat (limited to 'python')
-rwxr-xr-x  python/fatcat_import.py                            36
-rw-r--r--  python/fatcat_tools/importers/__init__.py           1
-rw-r--r--  python/fatcat_tools/importers/dblp_container.py    144
-rw-r--r--  python/tests/files/ISSN-to-ISSN-L.snip.txt           1
-rw-r--r--  python/tests/files/dblp_container_map.tsv           10
-rw-r--r--  python/tests/files/example_dblp_containers.json     10
-rw-r--r--  python/tests/import_dblp.py                         61
7 files changed, 256 insertions, 7 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 90b53e5c..1dcfec21 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -274,19 +274,29 @@ def run_doaj_article(args):
     JsonLinePusher(dai, args.json_file).run()
 
 def run_dblp_release(args):
-    dwi = DblpReleaseImporter(args.api,
+    dri = DblpReleaseImporter(args.api,
         dblp_container_map_file=args.dblp_container_map_file,
         edit_batch_size=args.batch_size,
         do_updates=args.do_updates,
         dump_json_mode=args.dump_json_mode,
     )
     Bs4XmlLargeFilePusher(
-        dwi,
+        dri,
         args.xml_file,
         DblpReleaseImporter.ELEMENT_TYPES,
         use_lxml=True,
     ).run()
 
+def run_dblp_container(args):
+    dci = DblpContainerImporter(args.api,
+        args.issn_map_file,
+        dblp_container_map_file=args.dblp_container_map_file,
+        dblp_container_map_output=args.dblp_container_map_output,
+        edit_batch_size=args.batch_size,
+        do_updates=args.do_updates,
+    )
+    JsonLinePusher(dci, args.json_file).run()
+
 def run_file_meta(args):
     # do_updates defaults to true for this importer
     fmi = FileMetaImporter(args.api,
@@ -675,6 +685,28 @@ def main():
         auth_var="FATCAT_AUTH_WORKER_DBLP",
     )
 
+    sub_dblp_container = subparsers.add_parser('dblp-container',
+        help="import dblp container metadata")
+    sub_dblp_container.add_argument('json_file',
+        help="File with DBLP container JSON to import from (see extra/dblp/)",
+        default=sys.stdin, type=argparse.FileType('rb'))
+    sub_dblp_container.add_argument('--dblp-container-map-file',
+        help="file path to dblp pre-existing prefix to container_id TSV file",
+        default=None, type=argparse.FileType('r'))
+    sub_dblp_container.add_argument('--dblp-container-map-output',
+        help="file path to output new dblp container map TSV to",
+        default=None, type=argparse.FileType('w'))
+    sub_dblp_container.add_argument('--issn-map-file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+    sub_dblp_container.add_argument('--do-updates',
+        action='store_true',
+        help="update any pre-existing container entities")
+    sub_dblp_container.set_defaults(
+        func=run_dblp_container,
+        auth_var="FATCAT_AUTH_WORKER_DBLP",
+    )
+
     sub_file_meta = subparsers.add_parser('file-meta',
         help="simple update-only importer for file metadata")
     sub_file_meta.set_defaults(
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index a14e2cec..6a2edeac 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -32,3 +32,4 @@ from .shadow import ShadowLibraryImporter
 from .file_meta import FileMetaImporter
 from .doaj_article import DoajArticleImporter
 from .dblp_release import DblpReleaseImporter
+from .dblp_container import DblpContainerImporter
diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py
new file mode 100644
index 00000000..a9f993a8
--- /dev/null
+++ b/python/fatcat_tools/importers/dblp_container.py
@@ -0,0 +1,144 @@
+
+"""
+Importer for DBLP container-level (journal/conference/series) metadata,
+pre-scraped in to JSON from HTML pages.
+"""
+
+import sys # noqa: F401
+
+import fatcat_openapi_client
+from fatcat_tools.normal import clean_str
+from fatcat_tools.importers.common import EntityImporter
+
+
+class DblpContainerImporter(EntityImporter):
+
+    def __init__(self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of container-level metadata scraped from dblp HTML")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DblpContainerImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        self.dblp_container_map_output = dblp_container_map_output
+        self.read_dblp_container_map_file(dblp_container_map_file)
+        self.read_issn_map_file(issn_map_file)
+        print("\t".join(["dblp_prefix", "container_id"]), file=self.dblp_container_map_output)
+
+    def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
+        self._dblp_container_map = dict()
+        print("Loading existing dblp prefix container map file...", file=sys.stderr)
+        for line in dblp_container_map_file:
+            if line.startswith("dblp_prefix") or len(line) == 0:
+                continue
+            (prefix, container_id) = line.split()[0:2]
+            assert len(container_id) == 26
+            self._dblp_container_map[prefix] = container_id
+            print("\t".join([prefix, container_id]), file=self.dblp_container_map_output)
+        print("Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+
+    def lookup_dblp_prefix(self, prefix):
+        if not prefix:
+            return None
+        return self._dblp_container_map.get(prefix)
+
+    def want(self, raw_record):
+        return True
+
+    def parse_record(self, row):
+        """
+        row is a python dict (parsed from JSON).
+
+        returns a ContainerEntity (or None if invalid or couldn't parse)
+        """
+
+        dblp_prefix = row.get('key') or row.get('dblp_prefix')
+        assert dblp_prefix
+        assert row['title']
+
+        container_type = None
+        if dblp_prefix.startswith('conf/'):
+            container_type = "conference-series"
+        elif dblp_prefix.startswith('journals/'):
+            container_type = "journal"
+        elif dblp_prefix.startswith('series/'):
+            container_type = "book-series"
+
+        issnl = None
+        for issn in row.get('issns', []):
+            issnl = self.issn2issnl(issn)
+            if issnl:
+                break
+
+        extra = {
+            'dblp': {
+                'prefix': dblp_prefix,
+            },
+        }
+
+        if row.get('homepage_url'):
+            extra['urls'] = [row['homepage_url']]
+
+        if row.get('acronym'):
+            extra['acronym'] = row['acronym']
+
+        ce = fatcat_openapi_client.ContainerEntity(
+            name=clean_str(row['title']),
+            container_type=container_type,
+            issnl=issnl,
+            wikidata_qid=row.get('wikidata_qid'),
+            extra=extra,
+        )
+        return ce
+
+    def try_update(self, ce):
+
+        dblp_prefix = ce.extra['dblp']['prefix']
+        existing = None
+        existing_container_id = self.lookup_dblp_prefix(dblp_prefix)
+        if existing_container_id:
+            existing = self.api.get_container(existing_container_id)
+        if not existing and ce.issnl:
+            # check if existing by ISSN-L
+            try:
+                existing = self.api.lookup_container(issnl=ce.issnl)
+            except fatcat_openapi_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+        if not existing and ce.wikidata_qid:
+            try:
+                existing = self.api.lookup_container(wikidata_qid=ce.wikidata_qid)
+            except fatcat_openapi_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+
+        # TODO: plan to add a fuzzy match check here
+
+        if not existing:
+            return True
+
+        if existing:
+            self.counts['exists'] += 1
+            print("\t".join([ce.extra['dblp']['prefix'], existing.ident]), file=self.dblp_container_map_output)
+            return False
+
+        # shouldn't get here
+        raise NotImplementedError()
+
+    def insert_batch(self, batch):
+        """
+        Because we want to print a prefix/container_id match for each row, we
+        require a special batch insert method
+        """
+        eg = self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+        for c_edit in eg.edits.containers:
+            c = self.api.get_container(c_edit.ident)
+            print("\t".join([c.extra['dblp']['prefix'], c.ident]), file=self.dblp_container_map_output)
diff --git a/python/tests/files/ISSN-to-ISSN-L.snip.txt b/python/tests/files/ISSN-to-ISSN-L.snip.txt
index c97de279..2569c443 100644
--- a/python/tests/files/ISSN-to-ISSN-L.snip.txt
+++ b/python/tests/files/ISSN-to-ISSN-L.snip.txt
@@ -8,3 +8,4 @@ ISSN	ISSN-L
 0000-0094	0002-0094
 0000-0108	0002-0108
 0000-0140	0002-0140
+1877-3273	1877-3273
diff --git a/python/tests/files/dblp_container_map.tsv b/python/tests/files/dblp_container_map.tsv
new file mode 100644
index 00000000..5f3dbacd
--- /dev/null
+++ b/python/tests/files/dblp_container_map.tsv
@@ -0,0 +1,10 @@
+dblp_prefix	container_id
+series/atlantis	7xuzjpmxqnaq7enf3qc45diflu
+series/apc	ktlimnolmvbwbhthlj66zuooea
+series/swb	s2qx5lvrdngvrk6y2arrem3qom
+series/sbece	7g52v6peqrek5l7zmstisnpkdm
+series/wssris	iszwbitlzfau5k77dpqifz5tf4
+series/lnm	r436k67l3fdm5cgs5hqif5e7hi
+conf/focs	aaaaaaaaaaaaaeiraaaaaaaaai
+conf/er	aaaaaaaaaaaaaeiraaaaaaaaai
+journals/cacm	aaaaaaaaaaaaaeiraaaaaaaaai
diff --git a/python/tests/files/example_dblp_containers.json b/python/tests/files/example_dblp_containers.json
new file mode 100644
index 00000000..c5137604
--- /dev/null
+++ b/python/tests/files/example_dblp_containers.json
@@ -0,0 +1,10 @@
+{"homepage_url": "http://link.springer.com/bookseries/10077", "issns": ["1877-3273"], "key": "series/atlantis", "title": "Atlantis Thinking Machines"}
+{"homepage_url": "http://ebooks.iospress.nl/bookseries/advances-in-parallel-computing", "issns": ["0927-5452", "1879-808X"], "key": "series/apc", "title": "Advances in Parallel Computing"}
+{"homepage_url": "http://link.springer.com/bookseries/7056", "issns": ["1559-7474"], "key": "series/swb", "title": "Semantic Web and Beyond: Computing for Human Experience"}
+{"homepage_url": "http://link.springer.com/bookseries/10059", "issns": ["2191-8112"], "key": "series/sbece", "title": "Springer Briefs in Electrical and Computer Engineering"}
+{"homepage_url": "http://www.worldscientific.com/series/wssris", "issns": [], "key": "series/wssris", "title": "World Scientific Series in Robotics and Intelligent Systems"}
+{"acronym": "LNM", "homepage_url": "http://link.springer.com/bookseries/304", "issns": ["0075-8434"], "key": "series/lnm", "title": "Lecture Notes in Mathematics"}
+{"homepage_url": "https://www.usenix.org/lisa", "issns": [], "key": "series/sage", "title": "Short Topics in System Administration"}
+{"homepage_url": "http://link.springer.com/bookseries/4205", "issns": ["2191-6586", "2191-6594"], "key": "series/acvpr", "title": "Advances in Computer Vision and Pattern Recognition"}
+{"acronym": "STAR", "homepage_url": "http://link.springer.com/bookseries/5208", "issns": ["1610-7438", "1610-742X"], "key": "series/star", "title": "Springer Tracts in Advanced Robotics"}
+{"acronym": "FAIA", "homepage_url": "http://ebooks.iospress.nl/bookseries/frontiers-in-artificial-intelligence-and-applications", "issns": ["0922-6389", "1879-8314"], "key": "series/faia", "title": "Frontiers in Artificial Intelligence and Applications"}
diff --git a/python/tests/import_dblp.py b/python/tests/import_dblp.py
index fd40eb06..5041b509 100644
--- a/python/tests/import_dblp.py
+++ b/python/tests/import_dblp.py
@@ -1,8 +1,9 @@
+import io
 import pytest
 from bs4 import BeautifulSoup
 
-from fatcat_tools.importers import DblpReleaseImporter, Bs4XmlLargeFilePusher
+from fatcat_tools.importers import DblpReleaseImporter, DblpContainerImporter, Bs4XmlLargeFilePusher, JsonLinePusher
 
 from fixtures import *
 
@@ -13,15 +14,21 @@ def dblp_importer(api):
 
 @pytest.fixture(scope="function")
 def dblp_importer_existing(api):
-    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
-        yield DblpReleaseImporter(api, issn_file, bezerk_mode=False, lookup_refs=True)
+    with open('tests/files/dblp_container_map.tsv', 'r') as tsv_file:
+        yield DblpReleaseImporter(api, tsv_file, bezerk_mode=False)
+
+@pytest.fixture(scope="function")
+def dblp_container_importer(api):
+    with open('tests/files/dblp_container_map.tsv', 'r') as tsv_file:
+        with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+            yield DblpContainerImporter(api, issn_file, tsv_file, io.StringIO(), bezerk_mode=True)
 
 def test_dblp_importer(dblp_importer):
     last_index = dblp_importer.api.get_changelog(limit=1)[0].index
     with open('tests/files/example_dblp.xml', 'rb') as f:
         dblp_importer.bezerk_mode = True
         counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
-    print(counts)
+    #print(counts)
     assert counts['insert'] == 3
     assert counts['exists'] == 0
     assert counts['skip'] == 1
@@ -44,12 +51,56 @@ def test_dblp_importer(dblp_importer):
         dblp_importer.bezerk_mode = False
         dblp_importer.reset()
         counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
-    print(counts)
+    #print(counts)
     assert counts['insert'] == 0
     assert counts['exists'] == 3
     assert counts['skip'] == 1
     assert last_index == dblp_importer.api.get_changelog(limit=1)[0].index
 
+def test_dblp_container_importer(dblp_container_importer):
+    last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index
+    output_tsv_map = io.StringIO()
+    with open('tests/files/example_dblp_containers.json', 'r') as f:
+        dblp_container_importer.bezerk_mode = True
+        dblp_container_importer.dblp_container_map_output = output_tsv_map
+        counts = JsonLinePusher(dblp_container_importer, f).run()
+    assert counts['insert'] == 10
+    assert counts['exists'] == 0
+    assert counts['skip'] == 0
+
+    # fetch most recent editgroup
+    change = dblp_container_importer.api.get_changelog_entry(index=last_index+1)
+    eg = change.editgroup
+    assert eg.description
+    assert "dblp" in eg.description.lower()
+    assert eg.extra['git_rev']
+    assert "fatcat_tools.DblpContainerImporter" in eg.extra['agent']
+
+    # check that entity name mangling was fixed on import
+    eg = dblp_container_importer.api.get_editgroup(eg.editgroup_id)
+    container = dblp_container_importer.api.get_container(eg.edits.containers[0].ident)
+    assert container.name == "Atlantis Thinking Machines"
+    assert container.issnl == "1877-3273"
+    assert container.container_type == "book-series"
+    assert container.extra['dblp']['prefix'] == "series/atlantis"
+    assert container.extra['urls'] == ["http://link.springer.com/bookseries/10077"]
+
+    last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index
+    output_tsv_map.seek(0)
+    print(output_tsv_map.read())
+    output_tsv_map.seek(0)
+    with open('tests/files/example_dblp_containers.json', 'r') as f:
+        dblp_container_importer.reset()
+        dblp_container_importer.bezerk_mode = False
+        dblp_container_importer.dblp_container_map_output = io.StringIO()
+        dblp_container_importer.read_dblp_container_map_file(output_tsv_map)
+        counts = JsonLinePusher(dblp_container_importer, f).run()
+    print(counts)
+    assert counts['insert'] == 0
+    assert counts['exists'] == 10
+    assert counts['skip'] == 0
+    assert last_index == dblp_container_importer.api.get_changelog(limit=1)[0].index
+
 def test_dblp_xml_parse(dblp_importer):
     with open('tests/files/example_dblp_article.xml', 'r') as f:
         soup = BeautifulSoup(f, "xml")
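
The container-map round trip exercised by test_dblp_container_importer above is the intended operational pattern: the TSV written on a first run seeds later runs, so re-imported rows resolve by dblp prefix and count as 'exists' rather than 'insert'. A rough sketch of that flow, under the same assumptions as the earlier snippet (`api` and the input file names are placeholders):

    import io
    from fatcat_tools.importers import DblpContainerImporter, JsonLinePusher

    map_out = io.StringIO()
    with open('dblp_containers.json') as f, open('ISSN-to-ISSN-L.txt') as issn_file:
        # first pass: start from a header-only map; each created container
        # is recorded in map_out as a "dblp_prefix<TAB>container_id" row
        dci = DblpContainerImporter(api, issn_file,
            io.StringIO("dblp_prefix\tcontainer_id\n"), map_out)
        JsonLinePusher(dci, f).run()

    # second pass: feed the map back in; every prefix now resolves to an
    # existing container, so counts['exists'] grows instead of counts['insert']
    map_out.seek(0)
    dci.reset()
    dci.dblp_container_map_output = io.StringIO()  # don't append to the map being read
    dci.read_dblp_container_map_file(map_out)
    with open('dblp_containers.json') as f:
        counts = JsonLinePusher(dci, f).run()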