From 91879651d7aa8a18a5fbd2b57dd60c171d6c8fba Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 15 May 2019 17:11:52 -0700
Subject: initial arxivraw importer (from parser)

---
 python/fatcat_tools/importers/__init__.py |   1 +
 python/fatcat_tools/importers/arxiv.py    | 298 ++++++++++++++++++++++++++++++
 2 files changed, 299 insertions(+)
 create mode 100644 python/fatcat_tools/importers/arxiv.py

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 497946ea..8ec219f8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -16,6 +16,7 @@ from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, Sqlit
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
 from .jalc import JalcImporter
 from .jstor import JstorImporter
+from .arxiv import ArxivRawImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
new file mode 100644
index 00000000..c53e47f1
--- /dev/null
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -0,0 +1,298 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from pylatexenc.latex2text import LatexNodes2Text
+
+import fatcat_client
+from .common import EntityImporter, clean
+from .crossref import lookup_license_slug
+
+
+latex2text = LatexNodes2Text()
+
+def parse_arxiv_authors(raw):
+    if not raw:
+        return []
+    authors = raw.split(', ')
+    if authors:
+        last = authors[-1].split(" and ")
+        if len(last) == 2:
+            authors[-1] = last[0]
+            authors.append(last[1])
+    authors = [latex2text.latex_to_text(a).strip() for a in authors]
+    return authors
+
+def test_parse_arxiv_authors():
+
+    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Raphael Chetrite",
+        "Shamik Gupta",
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
+        "Raphael Chetrite Shamik Gupta",
+    ]
+
+
+class ArxivRawImporter(EntityImporter):
+    """
+    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
+
+    TODO: this will require a special importer that keeps works together
+    TODO: arxiv_id lookup in API (rust) with no version specified should select
+        the "most recent" version; can be a simple sort?
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
+        # lower batch size, because multiple versions per entry (guessing 2-3 on average?)
+        batch_size = kwargs.get('edit_batch_size', 50)
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            batch_size=batch_size,
+            **kwargs)
+        self._test_override = False
+
+
+    def parse_record(self, record):
+
+        metadata = record.arXivRaw
+        extra = dict()
+        extra_arxiv = dict()
+
+        base_id = metadata.id.string
+        doi = None
+        if metadata.doi and metadata.doi.string:
+            doi = metadata.doi.string.lower().strip()
+            assert doi.startswith('10.')
+        title = latex2text.latex_to_text(metadata.title.string)
+        authors = parse_arxiv_authors(metadata.authors.string)
+        contribs = [fatcat_client.ReleaseContrib(raw_name=a, role='author') for a in authors]
+
+        lang = "en"     # the vast majority are in english
+        if metadata.comments and metadata.comments.string:
+            comments = metadata.comments.string.strip()
+            extra_arxiv['comments'] = comments
+            if 'in french' in comments.lower():
+                lang = 'fr'
+            elif 'in spanish' in comments.lower():
+                lang = 'es'
+            elif 'in portuguese' in comments.lower():
+                lang = 'pt'
+            elif 'in hindi' in comments.lower():
+                lang = 'hi'
+            elif 'in japanese' in comments.lower():
+                lang = 'ja'
+            elif 'in german' in comments.lower():
+                lang = 'de'
+            elif 'simplified chinese' in comments.lower():
+                lang = 'zh'
+            elif 'in russian' in comments.lower():
+                lang = 'ru'
+            # more languages?
+
+        release_type = "article-journal"
+
+        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
+            journal_ref = metadata.find('journal-ref').string.strip()
+            extra_arxiv['journal_ref'] = journal_ref
+            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
+                release_type = "conference-paper"
+        if metadata.find('report-no') and metadata.find('report-no').string:
+            extra['number'] = metadata.find('report-no').string.strip()
+            release_type = "report"
+        if metadata.find('acm-class') and metadata.find('acm-class').string:
+            extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
+        if metadata.categories and metadata.categories.string:
+            extra_arxiv['categories'] = metadata.categories.string.split()
+        license_slug = None
+        if metadata.license and metadata.license.string:
+            license_slug = lookup_license_slug(metadata.license.string)
+        abstracts = None
+        if metadata.abstract:
+            # TODO: test for this multi-abstract code path
+            abstracts = []
+            abst = metadata.abstract.string.strip()
+            orig = None
+            if '-----' in abst:
+                both = abst.split('-----')
+                abst = both[0].strip()
+                orig = both[1].strip()
+            if '$' in abst or '{' in abst:
+                mime = "application/x-latex"
+                abst_plain = latex2text.latex_to_text(abst)
+                abstracts.append(fatcat_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en"))
+            else:
+                mime = "text/plain"
+            abstracts.append(fatcat_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en"))
+            if orig:
+                abstracts.append(fatcat_client.ReleaseAbstract(content=orig, mimetype=mime))
+                # indicates that fulltext probably isn't english either
+                if lang == 'en':
+                    lang = None
+
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   arxiv: comments, categories, etc
+        extra_arxiv['base_id'] = base_id
+        extra['arxiv'] = extra_arxiv
+
+        versions = []
+        for version in metadata.find_all('version'):
+            arxiv_id = base_id + version['version']
+            release_date = version.date.string.strip()
+            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
+            # XXX: source_type?
+            versions.append(fatcat_client.ReleaseEntity(
+                work_id=None,
+                title=title,
+                #original_title
+                version=version['version'],
+                release_type=release_type,
+                release_stage='submitted',
+                release_date=release_date.isoformat(),
+                release_year=release_date.year,
+                ext_ids=fatcat_client.ReleaseExtIds(
+                    arxiv=arxiv_id,
+                ),
+                language=lang,
+                license_slug=license_slug,
+                abstracts=abstracts,
+                contribs=contribs,
+                extra=extra,
+            ))
+        # TODO: assert that versions are actually in order
+        assert versions
+
+        # only apply DOI to most recent version (HACK)
+        if doi:
+            versions[-1].ext_ids.doi = doi
+            versions[-1].release_stage = "published"
+        return versions
+
+    def try_update(self, versions):
+        """
+        This is pretty complex! There is no batch/bezerk mode for the arxiv
+        importer.
+
+        For each version, do a lookup by the full arxiv_id, and store the
+        work/release id results.
+
+        If a version has a DOI, also do a DOI lookup and store that result.
+        If there is an existing release matching both lookups, set that as
+        the existing work. If the two lookups don't match, stick with the
+        full arxiv_id match and move on (maybe log or at least count the
+        error?). If only one or the other matched, update that existing
+        release (and mark the version as existing).
+
+        If there was any existing release, take its work_id.
+
+        Iterate back through the versions. If one didn't already exist,
+        insert it with any existing work_id. If there wasn't an existing
+        work_id yet, look up the newly inserted release (by rev, from the
+        edit?) and use its work_id for the rest.
+
+        Do not pass any versions on for batch insert.
+        """
+
+        # first do lookups
+        any_work_id = None
+        for v in versions:
+            v._existing_work_id = None
+            v._updated = False
+            existing = None
+            existing_doi = None
+            try:
+                existing = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if v.ext_ids.doi:
+                try:
+                    existing_doi = self.api.lookup_release(doi=v.ext_ids.doi)
+                except fatcat_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+            if existing_doi:
+                if existing and existing.ident == existing_doi.ident:
+                    # great, they match and have idents, nothing to do
+                    pass
+                elif existing and existing.ident != existing_doi.ident:
+                    # could be bad, or could be that a new arxiv version was
+                    # created (update?)
+                    # stick with the arxiv_id match as existing; don't update anything
+                    pass
+                else:
+                    assert not existing
+                    if not existing_doi.ext_ids.arxiv:
+                        # update the existing DOI-based record with our full arxiv_id
+                        existing_doi.ext_ids.arxiv = v.ext_ids.arxiv
+                        self.api.update_release(self.get_editgroup_id(), existing_doi.ident, existing_doi)
+                        self.counts['update'] += 1
+                        # as a flag to not count below
+                        v._updated = True
+                    existing = existing_doi
+
+            if existing:
+                v._existing_work_id = existing.work_id
+                any_work_id = existing.work_id
+
+        last_edit = None
+        for v in versions:
+            if v._existing_work_id:
+                if not v._updated:
+                    self.counts['exists'] += 1
+                continue
+            if not any_work_id and last_edit:
+                # fetch the last inserted release from this editgroup
+                r = self.api.get_release_rev(last_edit.rev)
+                assert r.work_id
+                any_work_id = r.work_id
+            v.work_id = any_work_id
+            last_edit = self.api.insert_release(self.get_editgroup_id(), v)
+            self.counts['insert'] += 1
+
+        return False
+
+    def insert_batch(self, batch_batch):
+        # there is no batch/bezerk mode for the arxiv importer, except for testing
+        if self._test_override:
+            for batch in batch_batch:
+                self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+                    editgroup=fatcat_client.Editgroup(
+                        description=self.editgroup_description,
+                        extra=self.editgroup_extra),
+                    entity_list=batch))
+                self.counts['insert'] += len(batch) - 1
+        else:
+            raise NotImplementedError()
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over records, call parse_record on each
+        for article in soup.find_all("record"):
+            resp = self.parse_record(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
+if __name__=='__main__':
+    parser = ArxivRawImporter()
+    parser.parse_file(open(sys.argv[1]))
--
cgit v1.2.3
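
A quick way to smoke-test the parsing path in this patch, without touching the
fatcat API: feed the importer a local arXivRaw OAI-PMH XML dump and print a
short summary per parsed release. This is only a sketch; the file name
"arxivraw_sample.xml" is a placeholder, passing api=None assumes the
EntityImporter base constructor works without a live API client, and the
printed fields are just illustrative.

    from bs4 import BeautifulSoup
    from fatcat_tools.importers import ArxivRawImporter

    # parse-only: no editgroups are created and nothing is inserted
    importer = ArxivRawImporter(api=None)
    with open("arxivraw_sample.xml") as f:
        soup = BeautifulSoup(f, "xml")
    for record in soup.find_all("record"):
        # parse_record() returns one release entity per arXiv version
        for release in importer.parse_record(record):
            print(release.ext_ids.arxiv, release.release_date, release.title)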