author    | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-15 17:11:52 -0700
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700
commit    | 91879651d7aa8a18a5fbd2b57dd60c171d6c8fba (patch)
tree      | df8d1fd330e41de9e0c9b0a7dcfa97a7dbe6cf02
parent    | 1b592132fe1a127368189e07bdbf9a16a807a284 (diff)
download  | fatcat-91879651d7aa8a18a5fbd2b57dd60c171d6c8fba.tar.gz, fatcat-91879651d7aa8a18a5fbd2b57dd60c171d6c8fba.zip
initial arxivraw importer (from parser)
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1
-rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 298
-rw-r--r-- | python/parse_arxivraw_xml.py | 198
-rw-r--r-- | python/tests/import_arxiv.py | 96

4 files changed, 395 insertions, 198 deletions
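The new ArxivRawImporter follows the same EntityImporter/pusher pattern as the other fatcat importers; the added test drives it with Bs4XmlFilePusher over an arXivRaw OAI-PMH XML file. A minimal usage sketch along those lines (not part of the commit; the `api` client and the XML path are placeholder assumptions):

```python
# Sketch only: run the new importer over a local arXivRaw OAI-PMH XML dump,
# mirroring python/tests/import_arxiv.py. `api` is assumed to be an already
# configured fatcat_client API instance; the file path is a placeholder.
from fatcat_tools.importers import ArxivRawImporter, Bs4XmlFilePusher

def import_arxivraw(api, xml_path="arxivraw_dump.xml"):
    importer = ArxivRawImporter(api)
    with open(xml_path, 'r') as f:
        # each <record> element becomes one or more release entities (one per version)
        counts = Bs4XmlFilePusher(importer, f, "record").run()
    return counts
```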
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 497946ea..8ec219f8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -16,6 +16,7 @@ from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, Sqlit
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
 from .jalc import JalcImporter
 from .jstor import JstorImporter
+from .arxiv import ArxivRawImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
new file mode 100644
index 00000000..c53e47f1
--- /dev/null
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -0,0 +1,298 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from pylatexenc.latex2text import LatexNodes2Text
+
+import fatcat_client
+from .common import EntityImporter, clean
+from .crossref import lookup_license_slug
+
+
+latex2text = LatexNodes2Text()
+
+def parse_arxiv_authors(raw):
+    if not raw:
+        return []
+    authors = raw.split(', ')
+    if authors:
+        last = authors[-1].split(" and ")
+        if len(last) == 2:
+            authors[-1] = last[0]
+            authors.append(last[1])
+    authors = [latex2text.latex_to_text(a).strip() for a in authors]
+    return authors
+
+def test_parse_arxiv_authors():
+
+    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Raphael Chetrite",
+        "Shamik Gupta",
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
+        "Raphael Chetrite Shamik Gupta",
+    ]
+
+
+class ArxivRawImporter(EntityImporter):
+    """
+    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
+
+    TODO: this will require a special importer that keeps works together
+    TODO: arxiv_id lookup in API (rust) with no version specified should select
+        the "most recent" version; can be a simple sort?
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
+        # lower batch size, because multiple versions per entry (guessing 2-3 on average?)
+        batch_size = kwargs.get('edit_batch_size', 50)
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            batch_size=batch_size,
+            **kwargs)
+        self._test_override = False
+
+
+    def parse_record(self, record):
+
+        metadata = record.arXivRaw
+        extra = dict()
+        extra_arxiv = dict()
+
+        base_id = metadata.id.string
+        doi = None
+        if metadata.doi and metadata.doi.string:
+            doi = metadata.doi.string.lower().strip()
+            assert doi.startswith('10.')
+        title = latex2text.latex_to_text(metadata.title.string)
+        authors = parse_arxiv_authors(metadata.authors.string)
+        contribs = [fatcat_client.ReleaseContrib(raw_name=a, role='author') for a in authors]
+
+        lang = "en"     # the vast majority in english
+        if metadata.comments and metadata.comments.string:
+            comments = metadata.comments.string.strip()
+            extra_arxiv['comments'] = comments
+            if 'in french' in comments.lower():
+                lang = 'fr'
+            elif 'in spanish' in comments.lower():
+                lang = 'es'
+            elif 'in portuguese' in comments.lower():
+                lang = 'pt'
+            elif 'in hindi' in comments.lower():
+                lang = 'hi'
+            elif 'in japanese' in comments.lower():
+                lang = 'ja'
+            elif 'in german' in comments.lower():
+                lang = 'de'
+            elif 'simplified chinese' in comments.lower():
+                lang = 'zh'
+            elif 'in russian' in comments.lower():
+                lang = 'ru'
+            # more languages?
+
+        release_type = "article-journal"
+
+        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
+            journal_ref = metadata.find('journal-ref').string.strip()
+            extra_arxiv['journal_ref'] = journal_ref
+            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
+                release_type = "conference-paper"
+        if metadata.find('report-no') and metadata.find('report-no').string:
+            extra['number'] = metadata.find('report-no').string.strip()
+            release_type = "report"
+        if metadata.find('acm-class') and metadata.find('acm-class').string:
+            extra_arxiv['acm_class'] = metadata.find('acm_class').string.strip()
+        if metadata.categories and metadata.categories.string:
+            extra_arxiv['categories'] = metadata.categories.string.split()
+        license_slug = None
+        if metadata.license and metadata.license.string:
+            license_slug = lookup_license_slug(metadata.license.string)
+        abstracts = None
+        if metadata.abstract:
+            # TODO: test for this multi-abstract code path
+            abstracts = []
+            abst = metadata.abstract.string.strip()
+            orig = None
+            if '-----' in abst:
+                both = abst.split('-----')
+                abst = both[0].strip()
+                orig = both[1].strip()
+            if '$' in abst or '{' in abst:
+                mime = "application/x-latex"
+                abst_plain = latex2text.latex_to_text(abst)
+                abstracts.append(fatcat_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en"))
+            else:
+                mime = "text/plain"
+            abstracts.append(fatcat_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en"))
+            if orig:
+                abstracts.append(fatcat_client.ReleaseAbstract(content=orig, mimetype=mime))
+                # indicates that fulltext probably isn't english either
+                if lang == 'en':
+                    lang = None
+
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   arxiv: comments, categories, etc
+        extra_arxiv['base_id'] = base_id
+        extra['arxiv'] = extra_arxiv
+
+        versions = []
+        for version in metadata.find_all('version'):
+            arxiv_id = base_id + version['version']
+            release_date = version.date.string.strip()
+            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
+            # XXX: source_type?
+            versions.append(fatcat_client.ReleaseEntity(
+                work_id=None,
+                title=title,
+                #original_title
+                version=version['version'],
+                release_type="article-journal",
+                release_stage='submitted',
+                release_date=release_date.isoformat(),
+                release_year=release_date.year,
+                ext_ids=fatcat_client.ReleaseExtIds(
+                    arxiv=arxiv_id,
+                ),
+                language=lang,
+                license_slug=license_slug,
+                abstracts=abstracts,
+                contribs=contribs,
+                extra=extra,
+            ))
+        # TODO: assert that versions are actually in order
+        assert versions
+
+        # only apply DOI to most recent version (HACK)
+        if doi:
+            versions[-1].ext_ids.doi = doi
+            versions[-1].release_stage = "published"
+        return versions
+
+    def try_update(self, versions):
+        """
+        This is pretty complex! There is no batch/bezerk mode for arxiv importer.
+
+        For each version, do a lookup by full arxiv_id, and store work/release
+        id results.
+
+        If a version has a DOI, also do a doi lookup and store that result. If
+        there is an existing release with both matching, set that as the
+        existing work. If they don't match, use the full arxiv_id match and
+        move on (maybe log or at least count the error?). If it's a
+        one/or/other case, update the existing release (and mark version as
+        existing).
+
+        If there was any existing release, take its work_id.
+
+        Iterate back through versions. If it didn't already exist, insert it
+        with any existing work_id. If there wasn't an existing work_id, lookup
+        the new release (by rev from edit?) and use that for the rest.
+
+        Do not pass any versions on for batch insert.
+        """
+
+        # first do lookups
+        any_work_id = None
+        for v in versions:
+            v._existing_work_id = None
+            v._updated = False
+            existing = None
+            existing_doi = None
+            try:
+                existing = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if v.ext_ids.doi:
+                try:
+                    existing_doi = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
+                except fatcat_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+            if existing_doi:
+                if existing and existing.ident == existing_doi.ident:
+                    # great, they match and have idents, nothing to do
+                    pass
+                elif existing and existing.ident != existing_doi.ident:
+                    # could be bad, or could be that a new arxiv version was
+                    # created (update?)
+                    # stick with arxiv_id match as existing; don't update anything
+                    pass
+                else:
+                    assert not existing
+                    if not existing_doi.ext_ids.arxiv_id:
+                        # update the existing DOI-based record with our full arxiv_id
+                        existing_doi.ext_ids.arxiv_id = v.ext_ids.arxiv_id
+                        self.api.update_release(self.get_editgroup_id(), existing_doi.ident, existing_doi)
+                        self.counts['update'] += 1
+                        # as a flag to not count below
+                        v._updated = True
+                    existing = existing_doi
+
+            v._existing_work_id = existing.work_id
+            any_work_id = existing.work_id
+
+        last_edit = None
+        for v in versions:
+            if v._existing_work_id:
+                if not v._updated:
+                    self.counts['exists'] += 1
+                continue
+            if not any_work_id and last_edit:
+                # fetch the last inserted release from this group
+                r = self.api.get_release_rev(last_edit.rev)
+                assert r.work_id
+                any_work_id = r.work_id
+            v.work_id = any_work_id
+            last_edit = self.api.insert_release(self.get_editgroup_id(), v)
+            self.counts['insert'] += 1
+
+        return False
+
+    def insert_batch(self, batch_batch):
+        # there is no batch/bezerk mode for arxiv importer, except for testing
+        if self._test_override:
+            for batch in batch_batch:
+                self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+                    editgroup=fatcat_client.Editgroup(
+                        description=self.editgroup_description,
+                        extra=self.editgroup_extra),
+                    entity_list=batch))
+                self.counts['insert'] += len(batch) - 1
+        else:
+            raise NotImplementedError()
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_article on each
+        for article in soup.find_all("record"):
+            resp = self.parse_record(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
+if __name__=='__main__':
+    parser = ArxivRawImporter()
+    parser.parse_file(open(sys.argv[1]))
diff --git a/python/parse_arxivraw_xml.py b/python/parse_arxivraw_xml.py
deleted file mode 100644
index 9b9f28c9..00000000
--- a/python/parse_arxivraw_xml.py
+++ /dev/null
@@ -1,198 +0,0 @@
-
-import sys
-import json
-import datetime
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-from pylatexenc.latex2text import LatexNodes2Text
-
-
-latex2text = LatexNodes2Text()
-
-def parse_arxiv_authors(raw):
-    if not raw:
-        return []
-    authors = raw.split(', ')
-    if authors:
-        last = authors[-1].split(" and ")
-        if len(last) == 2:
-            authors[-1] = last[0]
-            authors.append(last[1])
-    authors = [latex2text.latex_to_text(a).strip() for a in authors]
-    return authors
-
-def test_parse_arxiv_authors():
-
-    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
-        "Raphael Chetrite",
-        "Shamik Gupta",
-        "Izaak Neri",
-        "Édgar Roldán",
-    ]
-    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
-        "Izaak Neri",
-        "Édgar Roldán",
-    ]
-    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
-        "Raphael Chetrite Shamik Gupta",
-    ]
-
-class ArxivRawXmlParser():
-    """
-    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
-
-    TODO: this will require a special importer that keeps works together
-    TODO: arxiv_id lookup in API (rust) with no version specified should select
-        the "most recent" version; can be a simple sort?
-    """
-
-    def __init__(self):
-        pass
-
-    def parse_file(self, handle):
-
-        # 1. open with beautiful soup
-        soup = BeautifulSoup(handle, "xml")
-
-        # 2. iterate over articles, call parse_article on each
-        for article in soup.find_all("record"):
-            resp = self.parse_record(article)
-            print(json.dumps(resp))
-            #sys.exit(-1)
-
-
-    def parse_record(self, record):
-
-        metadata = record.arXivRaw
-        extra = dict()
-        extra_arxiv = dict()
-
-        base_id = metadata.id.string
-        doi = None
-        if metadata.doi and metadata.doi.string:
-            doi = metadata.doi.string.lower().strip()
-            assert doi.startswith('10.')
-        title = latex2text.latex_to_text(metadata.title.string)
-        authors = parse_arxiv_authors(metadata.authors.string)
-        contribs = [dict(raw_name=a, role='author') for a in authors]
-
-        lang = "en"     # the vast majority in english
-        if metadata.comments and metadata.comments.string:
-            comments = metadata.comments.string.strip()
-            extra_arxiv['comments'] = comments
-            if 'in french' in comments.lower():
-                lang = 'fr'
-            elif 'in spanish' in comments.lower():
-                lang = 'es'
-            elif 'in portuguese' in comments.lower():
-                lang = 'pt'
-            elif 'in hindi' in comments.lower():
-                lang = 'hi'
-            elif 'in japanese' in comments.lower():
-                lang = 'ja'
-            elif 'in german' in comments.lower():
-                lang = 'de'
-            elif 'simplified chinese' in comments.lower():
-                lang = 'zh'
-            elif 'in russian' in comments.lower():
-                lang = 'ru'
-            # more languages?
-
-        release_type = "article-journal"
-
-        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
-            journal_ref = metadata.find('journal-ref').string.strip()
-            extra_arxiv['journal_ref'] = journal_ref
-            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
-                release_type = "conference-paper"
-        if metadata.find('report-no') and metadata.find('report-no').string:
-            extra['number'] = metadata.find('report-no').string.strip()
-            release_type = "report"
-        if metadata.find('acm-class') and metadata.find('acm-class').string:
-            extra_arxiv['acm_class'] = metadata.find('acm_class').string.strip()
-        if metadata.categories and metadata.categories.string:
-            extra_arxiv['categories'] = metadata.categories.string.split()
-        license_slug = None
-        if metadata.license and metadata.license.string:
-            # XXX: convert URL to slug
-            license_slug = metadata.license.string.strip()
-        abstracts = None
-        if metadata.abstract:
-            # TODO: test for this multi-abstract code path
-            abstracts = []
-            abst = metadata.abstract.string.strip()
-            orig = None
-            if '-----' in abst:
-                both = abst.split('-----')
-                abst = both[0].strip()
-                orig = both[1].strip()
-            if '$' in abst or '{' in abst:
-                mime = "application/x-latex"
-                abst_plain = latex2text.latex_to_text(abst)
-                abstracts.append(dict(content=abst_plain, mime="text/plain", lang="en"))
-            else:
-                mime = "text/plain"
-            abstracts.append(dict(content=abst, mime=mime, lang="en"))
-            if orig:
-                abstracts.append(dict(content=orig, mime=mime))
-
-        if extra_arxiv:
-            extra['arxiv'] = extra_arxiv
-        if not extra:
-            extra = None
-
-        versions = []
-        for version in metadata.find_all('version'):
-            arxiv_id = base_id + version['version']
-            release_date = version.date.string.strip()
-            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
-            versions.append(dict(
-                work_id=None,
-                title=title,
-                #original_title
-                release_type="article-journal",
-                release_status='submitted',     # XXX: source_type?
-                release_date=release_date.isoformat(),
-                release_year=release_date.year,
-                arxiv_id=arxiv_id,
-                #doi (see below)
-                #pmid
-                #pmcid
-                #isbn13     # never in Article
-                #volume
-                #issue
-                #pages
-                #publisher
-                language=lang,
-                #license_slug   # not in MEDLINE
-
-                # content, mimetype, lang
-                abstracts=abstracts,
-
-                # raw_name, role, raw_affiliation, extra
-                contribs=contribs,
-
-                # name, type, publisher, issnl
-                # extra: issnp, issne, original_name, languages, country
-                #container=container,   # very little/none; resolve via DOI?
-
-                # extra:
-                #   withdrawn_date
-                #   translation_of
-                #   subtitle
-                #   aliases
-                #   container_name
-                #   group-title
-                #   pubmed: retraction refs
-                extra=extra,
-            ))
-
-        # only apply DOI to most recent version (HACK)
-        if doi:
-            versions[-1]['doi'] = doi
-            versions[-1]['release_status'] = "published"
-        return base_id, versions
-
-if __name__=='__main__':
-    parser = ArxivRawXmlParser()
-    parser.parse_file(open(sys.argv[1]))
diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py
new file mode 100644
index 00000000..726bafc5
--- /dev/null
+++ b/python/tests/import_arxiv.py
@@ -0,0 +1,96 @@
+
+import json, gzip
+import pytest
+from fatcat_tools.importers import ArxivRawImporter, Bs4XmlFilePusher
+from fixtures import api
+from bs4 import BeautifulSoup
+
+
+@pytest.fixture(scope="function")
+def arxiv_importer(api):
+    ari = ArxivRawImporter(api, bezerk_mode=True)
+    ari._test_override = True
+    return ari
+
+def test_arxiv_importer(arxiv_importer):
+    last_index = arxiv_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+        arxiv_importer.bezerk_mode = True
+        counts = Bs4XmlFilePusher(arxiv_importer, f, "record").run()
+    assert counts['insert'] == 2
+    assert counts['exists'] == 0
+    assert counts['skip'] == 0
+
+    # fetch most recent editgroup
+    change = arxiv_importer.api.get_changelog_entry(index=last_index+1)
+    eg = change.editgroup
+    assert eg.description
+    assert "arxiv" in eg.description.lower()
+    assert eg.extra['git_rev']
+    assert "fatcat_tools.ArxivRawImporter" in eg.extra['agent']
+
+    last_index = arxiv_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+        arxiv_importer.bezerk_mode = False
+        arxiv_importer.reset()
+        counts = Bs4XmlFilePusher(arxiv_importer, f, "record").run()
+    assert counts['insert'] == 0
+    assert counts['exists'] == 2
+    assert counts['skip'] == 0
+    assert last_index == arxiv_importer.api.get_changelog(limit=1)[0].index
+
+def test_arxiv_xml_parse(arxiv_importer):
+    with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+        soup = BeautifulSoup(f, "xml")
+    r = arxiv_importer.parse_record(soup.find_all("record")[0])
+
+    r1 = r[0]
+    r2 = r[1]
+    print(r1.extra)
+    print(r2.extra)
+    assert r1.work_id == r2.work_id
+    assert r1.title == "Martingale theory for housekeeping heat"
+    assert r1.subtitle == None
+    assert r1.original_title == None
+    assert r1.release_type == "article-journal"
+    assert r1.release_stage == "submitted"
+    assert r2.release_stage == "published"
+    assert r1.license_slug == "ARXIV-NED-1.0"
+    assert r1.version == "v1"
+    assert r2.version == "v2"
+    assert r1.ext_ids.arxiv == "1810.09584v1"
+    assert r2.ext_ids.arxiv == "1810.09584v2"
+    assert r1.ext_ids.doi == None
+    assert r2.ext_ids.doi == "10.1209/0295-5075/124/60006"
+    assert r1.release_year == 2018
+    assert str(r1.release_date) == "2018-10-22"
+    assert r2.release_year == 2019
+    assert str(r2.release_date) == "2019-01-13"
+    # matched by ISSN, so shouldn't be in there?
+    #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London"
+    assert len(r1.contribs) == 4
+    # XXX: extra['arxiv'] stuff
+
+    assert r1.contribs[0].raw_name == "Raphael Chetrite"
+    assert r1.contribs[0].role == "author"
+    assert r1.contribs[1].raw_name == "Shamik Gupta"
+    assert r1.contribs[2].raw_name == "Izaak Neri"
+    assert r1.contribs[3].raw_name == "Édgar Roldán"
+    assert r1.contribs[3].role == "author"
+
+    assert len(r1.contribs) == 4
+    assert r1.contribs == r2.contribs
+
+    assert r1.abstracts[0].content.startswith("The housekeeping heat is the energy exchanged")
+    # order isn't deterministic
+    assert "application/x-latex" in [a.mimetype for a in r1.abstracts]
+    assert "text/plain" in [a.mimetype for a in r1.abstracts]
+
+    assert r1.abstracts == r2.abstracts
+
+    assert r1.extra['arxiv']['comments'] == "7 pages, 2 figures"
+    assert r1.extra['arxiv']['categories'] == ["cond-mat.stat-mech", "physics.bio-ph", "physics.data-an"]
+
+    assert r1.extra == r2.extra
+
+    assert not r1.refs
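The try_update() docstring in the new importer spells out how existing releases are reconciled per version: look up each versioned arxiv_id, optionally look up the DOI, prefer the arxiv_id match when the two disagree, and backfill the arxiv_id onto a DOI-only match (note that in the committed code the DOI branch still passes `arxiv=` to the lookup). A simplified sketch of that decision flow, not the committed implementation, with hypothetical lookup helpers standing in for `self.api.lookup_release()`:

```python
# Simplified sketch of the reconciliation described in try_update()'s docstring.
# `lookup_by_arxiv` and `lookup_by_doi` are hypothetical helpers that return an
# existing release entity or None; they are not part of the commit.
def reconcile_version(version, lookup_by_arxiv, lookup_by_doi):
    existing = lookup_by_arxiv(version.ext_ids.arxiv)
    existing_doi = lookup_by_doi(version.ext_ids.doi) if version.ext_ids.doi else None

    if existing and existing_doi and existing.ident == existing_doi.ident:
        # same release found both ways: nothing to do
        return existing, False
    if existing:
        # arxiv_id match wins, even when the DOI points at a different release
        return existing, False
    if existing_doi:
        # DOI-only match: backfill the versioned arxiv_id and flag an update
        existing_doi.ext_ids.arxiv = version.ext_ids.arxiv
        return existing_doi, True
    # no match: the caller inserts a new release, reusing any work_id already
    # found for a sibling version so all versions stay grouped under one work
    return None, False
```

Versions with no match are then inserted one at a time (there is no bezerk/batch path outside of tests), picking up the work_id of whichever sibling version was found or inserted first.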