aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org>, 2019-05-15 17:11:52 -0700
committer: Bryan Newbold <bnewbold@robocracy.org>, 2019-05-21 11:41:29 -0700
commit: 91879651d7aa8a18a5fbd2b57dd60c171d6c8fba (patch)
tree: df8d1fd330e41de9e0c9b0a7dcfa97a7dbe6cf02 /python/fatcat_tools
parent: 1b592132fe1a127368189e07bdbf9a16a807a284 (diff)
download: fatcat-91879651d7aa8a18a5fbd2b57dd60c171d6c8fba.tar.gz
fatcat-91879651d7aa8a18a5fbd2b57dd60c171d6c8fba.zip
initial arxivraw importer (from parser)
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/importers/__init__.py 1
-rw-r--r-- python/fatcat_tools/importers/arxiv.py 298
2 files changed, 299 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 497946ea..8ec219f8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -16,6 +16,7 @@ from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, Sqlit
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
from .jalc import JalcImporter
from .jstor import JstorImporter
+from .arxiv import ArxivRawImporter
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
new file mode 100644
index 00000000..c53e47f1
--- /dev/null
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -0,0 +1,298 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from pylatexenc.latex2text import LatexNodes2Text
+
+import fatcat_client
+from .common import EntityImporter, clean
+from .crossref import lookup_license_slug
+
+
# Shared LaTeX-to-text converter, reused by the author, title and abstract
# parsing in this module.
latex2text = LatexNodes2Text()


def parse_arxiv_authors(raw):
    """
    Splits a raw arXiv author string into a list of plain-text names.

    Names are expected to be separated by ", ", with the final two names
    optionally joined by " and ". A serial comma before the final name
    ("A, B, and C") is also accepted. LaTeX markup (eg, accent escapes) in
    each name is converted to text, and surrounding whitespace is stripped.

    Returns an empty list if `raw` is empty or None.
    """
    if not raw:
        return []
    authors = raw.split(', ')
    # serial comma: "A, B, and C" leaves a final chunk of "and C"
    if len(authors) > 1 and authors[-1].startswith("and "):
        authors[-1] = authors[-1][len("and "):]
    # final pair joined by " and ": replace the last chunk with both names
    last = authors[-1].split(" and ")
    if len(last) == 2:
        authors[-1:] = last
    authors = [latex2text.latex_to_text(a).strip() for a in authors]
    # drop blank entries (eg, from a trailing separator)
    return [a for a in authors if a]
+
def test_parse_arxiv_authors():
    """
    Checks splitting on ", " and a final " and ", decoding of LaTeX accent
    escapes, and that a string with no separators stays a single entry.
    """
    accented_pair = ["Izaak Neri", "Édgar Roldán"]
    cases = [
        (
            "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an",
            ["Raphael Chetrite", "Shamik Gupta"] + accented_pair,
        ),
        (
            "Izaak Neri and \\'Edgar Rold\\'an",
            accented_pair,
        ),
        (
            "Raphael Chetrite Shamik Gupta",
            ["Raphael Chetrite Shamik Gupta"],
        ),
    ]
    for raw, expected in cases:
        assert parse_arxiv_authors(raw) == expected
+
+
class ArxivRawImporter(EntityImporter):
    """
    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities

    Each record is parsed into a list of releases (one per arXiv version).
    try_update() looks these up and inserts them one at a time, so that all
    versions of a paper can share a single work.

    TODO: this will require a special importer that keeps works together
    TODO: arxiv_id lookup in API (rust) with no version specified should select
          the "most recent" version; can be a simple sort?
    """

    # (phrase searched for in the lower-cased comments field, language code);
    # checked in order, first match wins
    _COMMENT_LANG_HINTS = (
        ('in french', 'fr'),
        ('in spanish', 'es'),
        ('in portuguese', 'pt'),
        ('in hindi', 'hi'),
        ('in japanese', 'ja'),
        ('in german', 'de'),
        ('simplified chinese', 'zh'),
        ('in russian', 'ru'),
        # more languages?
    )

    def __init__(self, api, **kwargs):
        """
        `api` is the fatcat API client used for lookups and inserts.

        Recognized keyword arguments: `editgroup_description`,
        `editgroup_extra` (an 'agent' key is added if missing) and
        `edit_batch_size`. Remaining keyword arguments are forwarded to
        EntityImporter unchanged.
        """

        # pop (not get) the values which are re-passed explicitly below, so
        # they are not also forwarded via **kwargs as duplicate keywords
        eg_desc = kwargs.pop('editgroup_description',
            "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
        eg_extra = kwargs.pop('editgroup_extra', dict())
        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
        # lower batch size, because multiple versions per entry (guessing 2-3 on average?)
        batch_size = kwargs.get('edit_batch_size', 50)
        super().__init__(api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            batch_size=batch_size,
            **kwargs)
        # when True, insert_batch() is allowed (intended for tests only)
        self._test_override = False

    def parse_record(self, record):
        """
        Converts one parsed arXivRaw record into release entities.

        `record` is a BeautifulSoup element with an `arXivRaw` child element.

        Returns a non-empty list of fatcat_client.ReleaseEntity, one per
        <version> element, in document order. All versions share the same
        title, contribs, abstracts and extra metadata; only the last one is
        given the DOI (if any) and marked as published.

        Raises AssertionError if the record contains no <version> elements.
        """

        metadata = record.arXivRaw
        extra = dict()
        extra_arxiv = dict()

        base_id = metadata.id.string
        doi = None
        if metadata.doi and metadata.doi.string:
            doi = metadata.doi.string.lower().strip()
            if not doi.startswith('10.'):
                # one malformed DOI in the feed should not abort the import
                sys.stderr.write("skipping bogus DOI: {}\n".format(doi))
                doi = None
        title = latex2text.latex_to_text(metadata.title.string)
        authors = parse_arxiv_authors(metadata.authors.string)
        contribs = [fatcat_client.ReleaseContrib(raw_name=a, role='author') for a in authors]

        lang = "en"  # the vast majority in english
        if metadata.comments and metadata.comments.string:
            comments = metadata.comments.string.strip()
            extra_arxiv['comments'] = comments
            comments_lower = comments.lower()
            for hint, code in self._COMMENT_LANG_HINTS:
                if hint in comments_lower:
                    lang = code
                    break

        release_type = "article-journal"

        journal_ref = metadata.find('journal-ref')
        if journal_ref and journal_ref.string:
            journal_ref = journal_ref.string.strip()
            extra_arxiv['journal_ref'] = journal_ref
            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
                release_type = "conference-paper"
        report_no = metadata.find('report-no')
        if report_no and report_no.string:
            extra['number'] = report_no.string.strip()
            # a report number takes precedence over the journal-ref guess
            release_type = "report"
        acm_class = metadata.find('acm-class')
        if acm_class and acm_class.string:
            extra_arxiv['acm_class'] = acm_class.string.strip()
        if metadata.categories and metadata.categories.string:
            extra_arxiv['categories'] = metadata.categories.string.split()
        license_slug = None
        if metadata.license and metadata.license.string:
            license_slug = lookup_license_slug(metadata.license.string)
        abstracts = None
        if metadata.abstract and metadata.abstract.string:
            # TODO: test for this multi-abstract code path
            abstracts = []
            abst = metadata.abstract.string.strip()
            orig = None
            # a run of dashes separates the english abstract from a second
            # abstract, which is stored without a language code
            if '-----' in abst:
                both = abst.split('-----')
                abst = both[0].strip()
                orig = both[1].strip()
            if '$' in abst or '{' in abst:
                # looks like LaTeX: store a plain-text rendering in addition
                # to the raw markup
                mime = "application/x-latex"
                abst_plain = latex2text.latex_to_text(abst)
                abstracts.append(fatcat_client.ReleaseAbstract(
                    content=abst_plain, mimetype="text/plain", lang="en"))
            else:
                mime = "text/plain"
            abstracts.append(fatcat_client.ReleaseAbstract(
                content=abst, mimetype=mime, lang="en"))
            if orig:
                abstracts.append(fatcat_client.ReleaseAbstract(
                    content=orig, mimetype=mime))
                # indicates that fulltext probably isn't english either
                if lang == 'en':
                    lang = None

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   arxiv: comments, categories, etc
        extra_arxiv['base_id'] = base_id
        extra['arxiv'] = extra_arxiv

        versions = []
        for version in metadata.find_all('version'):
            # full identifier is the base id plus the version attribute
            arxiv_id = base_id + version['version']
            release_date = version.date.string.strip()
            release_date = datetime.datetime.strptime(
                release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
            # XXX: source_type?
            versions.append(fatcat_client.ReleaseEntity(
                work_id=None,
                title=title,
                #original_title
                version=version['version'],
                release_type=release_type,
                release_stage='submitted',
                release_date=release_date.isoformat(),
                release_year=release_date.year,
                ext_ids=fatcat_client.ReleaseExtIds(
                    arxiv=arxiv_id,
                ),
                language=lang,
                license_slug=license_slug,
                abstracts=abstracts,
                contribs=contribs,
                extra=extra,
            ))
        # TODO: assert that versions are actually in order
        assert versions

        # only apply DOI to most recent version (HACK)
        if doi:
            versions[-1].ext_ids.doi = doi
            versions[-1].release_stage = "published"
        return versions

    def try_update(self, versions):
        """
        This is pretty complex! There is no batch/bezerk mode for arxiv importer.

        For each version, do a lookup by full arxiv_id, and store work/release
        id results.

        If a version has a DOI, also do a doi lookup and store that result. If
        there is an existing release with both matching, set that as the
        existing work. If they don't match, use the full arxiv_id match and
        move on (maybe log or at least count the error?). If it's a
        one/or/other case, update the existing release (and mark version as
        existing).

        If there was any existing release, take its work_id.

        Iterate back through versions. If it didn't already exist, insert it
        with any existing work_id. If there wasn't an existing work_id, lookup
        the new release (by rev from edit?) and use that for the rest.

        Do not pass any versions on for batch insert.

        Always returns False.
        """

        # first do lookups
        any_work_id = None
        for v in versions:
            v._existing_work_id = None
            v._updated = False
            existing = None
            existing_doi = None
            try:
                existing = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
            except fatcat_client.rest.ApiException as err:
                # 404 just means "no such release yet"
                if err.status != 404:
                    raise
            if v.ext_ids.doi:
                try:
                    existing_doi = self.api.lookup_release(doi=v.ext_ids.doi)
                except fatcat_client.rest.ApiException as err:
                    if err.status != 404:
                        raise
            if existing_doi and not existing:
                # only the DOI matched an existing release
                if not existing_doi.ext_ids.arxiv:
                    # update the existing DOI-based record with our full arxiv_id
                    existing_doi.ext_ids.arxiv = v.ext_ids.arxiv
                    self.api.update_release(
                        self.get_editgroup_id(), existing_doi.ident, existing_doi)
                    self.counts['update'] += 1
                    # as a flag to not count below
                    v._updated = True
                existing = existing_doi
            # otherwise, if both lookups matched: either they agree (nothing
            # to do), or they disagree, which could be bad, or could be that a
            # new arxiv version was created (update?). Either way stick with
            # the arxiv_id match as existing; don't update anything

            if existing:
                v._existing_work_id = existing.work_id
                any_work_id = existing.work_id

        last_edit = None
        for v in versions:
            if v._existing_work_id:
                if not v._updated:
                    self.counts['exists'] += 1
                continue
            if not any_work_id and last_edit:
                # fetch the last inserted release from this group
                # NOTE(review): confirm get_release_rev() and insert_release()
                # are the correct method names on the API client
                r = self.api.get_release_rev(last_edit.rev)
                assert r.work_id
                any_work_id = r.work_id
            v.work_id = any_work_id
            last_edit = self.api.insert_release(self.get_editgroup_id(), v)
            self.counts['insert'] += 1

        return False

    def insert_batch(self, batch_batch):
        """
        Inserts each list of versions in `batch_batch` as an auto-batch.

        Only permitted when self._test_override is set; otherwise raises
        NotImplementedError.
        """
        # there is no batch/bezerk mode for arxiv importer, except for testing
        if self._test_override:
            for batch in batch_batch:
                self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
                    editgroup=fatcat_client.Editgroup(
                        description=self.editgroup_description,
                        extra=self.editgroup_extra),
                    entity_list=batch))
                # NOTE(review): presumably the caller already counts one
                # insert per batch, hence the -1; verify against EntityImporter
                self.counts['insert'] += len(batch) - 1
        else:
            raise NotImplementedError()

    def parse_file(self, handle):
        """
        Debugging helper: parses every <record> in the XML read from `handle`
        and prints the result of parse_record() for each.
        """

        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over articles, call parse_record on each
        for article in soup.find_all("record"):
            resp = self.parse_record(article)
            # NOTE(review): resp is a list of fatcat_client.ReleaseEntity;
            # confirm json.dumps() can serialize these as-is
            print(json.dumps(resp))
+
if __name__ == '__main__':
    # Ad-hoc debugging entry point: parse the arXivRaw XML file named on the
    # command line and print the parsed records.
    # parse_file() never touches the API client, so None is passed for the
    # required `api` argument.
    # NOTE(review): confirm EntityImporter.__init__ tolerates api=None
    parser = ArxivRawImporter(None)
    with open(sys.argv[1]) as handle:
        parser.parse_file(handle)