author    Bryan Newbold <bnewbold@robocracy.org>  2019-05-15 17:11:52 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2019-05-21 11:41:29 -0700
commit    91879651d7aa8a18a5fbd2b57dd60c171d6c8fba (patch)
tree      df8d1fd330e41de9e0c9b0a7dcfa97a7dbe6cf02
parent    1b592132fe1a127368189e07bdbf9a16a807a284 (diff)
initial arxivraw importer (from parser)
-rw-r--r--  python/fatcat_tools/importers/__init__.py     1
-rw-r--r--  python/fatcat_tools/importers/arxiv.py       298
-rw-r--r--  python/parse_arxivraw_xml.py                 198
-rw-r--r--  python/tests/import_arxiv.py                  96
4 files changed, 395 insertions(+), 198 deletions(-)
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 497946ea..8ec219f8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -16,6 +16,7 @@ from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, Sqlit
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
from .jalc import JalcImporter
from .jstor import JstorImporter
+from .arxiv import ArxivRawImporter
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
new file mode 100644
index 00000000..c53e47f1
--- /dev/null
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -0,0 +1,298 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from pylatexenc.latex2text import LatexNodes2Text
+
+import fatcat_client
+from .common import EntityImporter, clean
+from .crossref import lookup_license_slug
+
+
+latex2text = LatexNodes2Text()
+
+def parse_arxiv_authors(raw):
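+    # The raw author string looks like "A. One, B. Two and C. Three": split on
+    # ", ", then peel a trailing " and " off the last element. LaTeX escapes
+    # (eg, \'E) are decoded to unicode via pylatexenc.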
+ if not raw:
+ return []
+ authors = raw.split(', ')
+ if authors:
+ last = authors[-1].split(" and ")
+ if len(last) == 2:
+ authors[-1] = last[0]
+ authors.append(last[1])
+ authors = [latex2text.latex_to_text(a).strip() for a in authors]
+ return authors
+
+def test_parse_arxiv_authors():
+
+ assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+ "Raphael Chetrite",
+ "Shamik Gupta",
+ "Izaak Neri",
+ "Édgar Roldán",
+ ]
+ assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
+ "Izaak Neri",
+ "Édgar Roldán",
+ ]
+ assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
+ "Raphael Chetrite Shamik Gupta",
+ ]
+
+
+class ArxivRawImporter(EntityImporter):
+ """
+ Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
+
+ TODO: this will require a special importer that keeps works together
+ TODO: arxiv_id lookup in API (rust) with no version specified should select
+ the "most recent" version; can be a simple sort?
+ """
+
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
+ # lower batch size, because multiple versions per entry (guessing 2-3 on average?)
+ batch_size = kwargs.get('edit_batch_size', 50)
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ batch_size=batch_size,
+ **kwargs)
+ self._test_override = False
+
+
+ def parse_record(self, record):
+
+ metadata = record.arXivRaw
+ extra = dict()
+ extra_arxiv = dict()
+
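+        # base_id is the bare arxiv identifier (eg, "1810.09584"); versioned
+        # identifiers like "1810.09584v1" are constructed per-version below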
+ base_id = metadata.id.string
+ doi = None
+ if metadata.doi and metadata.doi.string:
+ doi = metadata.doi.string.lower().strip()
+ assert doi.startswith('10.')
+ title = latex2text.latex_to_text(metadata.title.string)
+ authors = parse_arxiv_authors(metadata.authors.string)
+ contribs = [fatcat_client.ReleaseContrib(raw_name=a, role='author') for a in authors]
+
+ lang = "en" # the vast majority in english
+ if metadata.comments and metadata.comments.string:
+ comments = metadata.comments.string.strip()
+ extra_arxiv['comments'] = comments
+ if 'in french' in comments.lower():
+ lang = 'fr'
+ elif 'in spanish' in comments.lower():
+ lang = 'es'
+ elif 'in portuguese' in comments.lower():
+ lang = 'pt'
+ elif 'in hindi' in comments.lower():
+ lang = 'hi'
+ elif 'in japanese' in comments.lower():
+ lang = 'ja'
+ elif 'in german' in comments.lower():
+ lang = 'de'
+ elif 'simplified chinese' in comments.lower():
+ lang = 'zh'
+ elif 'in russian' in comments.lower():
+ lang = 'ru'
+ # more languages?
+
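+        # default release_type; bumped to conference-paper or report below if
+        # the journal-ref or report-no fields hint otherwise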
+ release_type = "article-journal"
+
+ if metadata.find('journal-ref') and metadata.find('journal-ref').string:
+ journal_ref = metadata.find('journal-ref').string.strip()
+ extra_arxiv['journal_ref'] = journal_ref
+ if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
+ release_type = "conference-paper"
+ if metadata.find('report-no') and metadata.find('report-no').string:
+ extra['number'] = metadata.find('report-no').string.strip()
+ release_type = "report"
+ if metadata.find('acm-class') and metadata.find('acm-class').string:
+            extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
+ if metadata.categories and metadata.categories.string:
+ extra_arxiv['categories'] = metadata.categories.string.split()
+ license_slug = None
+ if metadata.license and metadata.license.string:
+ license_slug = lookup_license_slug(metadata.license.string)
+ abstracts = None
+ if metadata.abstract:
+ # TODO: test for this multi-abstract code path
+ abstracts = []
+ abst = metadata.abstract.string.strip()
+ orig = None
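+            # some records carry a second, original-language abstract after a
+            # "-----" separator; keep both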
+ if '-----' in abst:
+ both = abst.split('-----')
+ abst = both[0].strip()
+ orig = both[1].strip()
+ if '$' in abst or '{' in abst:
+ mime = "application/x-latex"
+ abst_plain = latex2text.latex_to_text(abst)
+ abstracts.append(fatcat_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en"))
+ else:
+ mime = "text/plain"
+ abstracts.append(fatcat_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en"))
+ if orig:
+ abstracts.append(fatcat_client.ReleaseAbstract(content=orig, mimetype=mime))
+ # indicates that fulltext probably isn't english either
+ if lang == 'en':
+ lang = None
+
+
+ # extra:
+ # withdrawn_date
+ # translation_of
+ # subtitle
+ # aliases
+ # container_name
+ # group-title
+ # arxiv: comments, categories, etc
+ extra_arxiv['base_id'] = base_id
+ extra['arxiv'] = extra_arxiv
+
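+        # emit one release per arXiv version; they share title/contribs/abstracts
+        # and differ in versioned arxiv_id, dates, and (for the last version) DOI/stage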
+ versions = []
+ for version in metadata.find_all('version'):
+ arxiv_id = base_id + version['version']
+ release_date = version.date.string.strip()
+ release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
+ # XXX: source_type?
+ versions.append(fatcat_client.ReleaseEntity(
+ work_id=None,
+ title=title,
+ #original_title
+ version=version['version'],
+                release_type=release_type,
+ release_stage='submitted',
+ release_date=release_date.isoformat(),
+ release_year=release_date.year,
+ ext_ids=fatcat_client.ReleaseExtIds(
+ arxiv=arxiv_id,
+ ),
+ language=lang,
+ license_slug=license_slug,
+ abstracts=abstracts,
+ contribs=contribs,
+ extra=extra,
+ ))
+ # TODO: assert that versions are actually in order
+ assert versions
+
+ # only apply DOI to most recent version (HACK)
+ if doi:
+ versions[-1].ext_ids.doi = doi
+ versions[-1].release_stage = "published"
+ return versions
+
+ def try_update(self, versions):
+ """
+ This is pretty complex! There is no batch/bezerk mode for arxiv importer.
+
+ For each version, do a lookup by full arxiv_id, and store work/release
+ id results.
+
+ If a version has a DOI, also do a doi lookup and store that result. If
+ there is an existing release with both matching, set that as the
+ existing work. If they don't match, use the full arxiv_id match and
+        move on (maybe log or at least count the error?). If only the DOI
+        lookup matched, update that existing release with the arxiv_id (and
+        mark the version as existing).
+
+ If there was any existing release, take its work_id.
+
+ Iterate back through versions. If it didn't already exist, insert it
+ with any existing work_id. If there wasn't an existing work_id, lookup
+ the new release (by rev from edit?) and use that for the rest.
+
+ Do not pass any versions on for batch insert.
+ """
+
+ # first do lookups
+ any_work_id = None
+ for v in versions:
+ v._existing_work_id = None
+ v._updated = False
+ existing = None
+ existing_doi = None
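+            # lookup by the full versioned arxiv_id, and also by DOI if this
+            # version carries one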
+ try:
+ existing = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ if v.ext_ids.doi:
+ try:
+                    existing_doi = self.api.lookup_release(doi=v.ext_ids.doi)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ if existing_doi:
+ if existing and existing.ident == existing_doi.ident:
+ # great, they match and have idents, nothing to do
+ pass
+ elif existing and existing.ident != existing_doi.ident:
+ # could be bad, or could be that a new arxiv version was
+ # created (update?)
+ # stick with arxiv_id match as existing; don't update anything
+ pass
+ else:
+ assert not existing
+                if not existing_doi.ext_ids.arxiv:
+                    # update the existing DOI-based record with our full arxiv_id
+                    existing_doi.ext_ids.arxiv = v.ext_ids.arxiv
+ self.api.update_release(self.get_editgroup_id(), existing_doi.ident, existing_doi)
+ self.counts['update'] += 1
+ # as a flag to not count below
+ v._updated = True
+ existing = existing_doi
+
+            if existing:
+                v._existing_work_id = existing.work_id
+                any_work_id = existing.work_id
+
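+        # second pass: insert versions that don't already exist, grouping them
+        # under the existing work if one was found, otherwise under the work
+        # of the first release inserted here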
+ last_edit = None
+ for v in versions:
+ if v._existing_work_id:
+ if not v._updated:
+ self.counts['exists'] += 1
+ continue
+ if not any_work_id and last_edit:
+ # fetch the last inserted release from this group
+ r = self.api.get_release_rev(last_edit.rev)
+ assert r.work_id
+ any_work_id = r.work_id
+ v.work_id = any_work_id
+ last_edit = self.api.insert_release(self.get_editgroup_id(), v)
+ self.counts['insert'] += 1
+
+ return False
+
+ def insert_batch(self, batch_batch):
+ # there is no batch/bezerk mode for arxiv importer, except for testing
+ if self._test_override:
+ for batch in batch_batch:
+ self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+ editgroup=fatcat_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+ self.counts['insert'] += len(batch) - 1
+ else:
+ raise NotImplementedError()
+
+ def parse_file(self, handle):
+
+ # 1. open with beautiful soup
+ soup = BeautifulSoup(handle, "xml")
+
+ # 2. iterate over articles, call parse_article on each
+ for article in soup.find_all("record"):
+ resp = self.parse_record(article)
+            # entities aren't directly JSON-serializable; dump their dict form
+            print(json.dumps([r.to_dict() for r in resp], default=str))
+ #sys.exit(-1)
+
+if __name__ == '__main__':
+    # quick manual check: parse and print records from a local XML file;
+    # api=None should suffice here since parse_file() never calls the API
+    parser = ArxivRawImporter(None)
+    parser.parse_file(open(sys.argv[1]))
diff --git a/python/parse_arxivraw_xml.py b/python/parse_arxivraw_xml.py
deleted file mode 100644
index 9b9f28c9..00000000
--- a/python/parse_arxivraw_xml.py
+++ /dev/null
@@ -1,198 +0,0 @@
-
-import sys
-import json
-import datetime
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-from pylatexenc.latex2text import LatexNodes2Text
-
-
-latex2text = LatexNodes2Text()
-
-def parse_arxiv_authors(raw):
- if not raw:
- return []
- authors = raw.split(', ')
- if authors:
- last = authors[-1].split(" and ")
- if len(last) == 2:
- authors[-1] = last[0]
- authors.append(last[1])
- authors = [latex2text.latex_to_text(a).strip() for a in authors]
- return authors
-
-def test_parse_arxiv_authors():
-
- assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
- "Raphael Chetrite",
- "Shamik Gupta",
- "Izaak Neri",
- "Édgar Roldán",
- ]
- assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
- "Izaak Neri",
- "Édgar Roldán",
- ]
- assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
- "Raphael Chetrite Shamik Gupta",
- ]
-
-class ArxivRawXmlParser():
- """
- Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
-
- TODO: this will require a special importer that keeps works together
- TODO: arxiv_id lookup in API (rust) with no version specified should select
- the "most recent" version; can be a simple sort?
- """
-
- def __init__(self):
- pass
-
- def parse_file(self, handle):
-
- # 1. open with beautiful soup
- soup = BeautifulSoup(handle, "xml")
-
- # 2. iterate over articles, call parse_article on each
- for article in soup.find_all("record"):
- resp = self.parse_record(article)
- print(json.dumps(resp))
- #sys.exit(-1)
-
-
- def parse_record(self, record):
-
- metadata = record.arXivRaw
- extra = dict()
- extra_arxiv = dict()
-
- base_id = metadata.id.string
- doi = None
- if metadata.doi and metadata.doi.string:
- doi = metadata.doi.string.lower().strip()
- assert doi.startswith('10.')
- title = latex2text.latex_to_text(metadata.title.string)
- authors = parse_arxiv_authors(metadata.authors.string)
- contribs = [dict(raw_name=a, role='author') for a in authors]
-
- lang = "en" # the vast majority in english
- if metadata.comments and metadata.comments.string:
- comments = metadata.comments.string.strip()
- extra_arxiv['comments'] = comments
- if 'in french' in comments.lower():
- lang = 'fr'
- elif 'in spanish' in comments.lower():
- lang = 'es'
- elif 'in portuguese' in comments.lower():
- lang = 'pt'
- elif 'in hindi' in comments.lower():
- lang = 'hi'
- elif 'in japanese' in comments.lower():
- lang = 'ja'
- elif 'in german' in comments.lower():
- lang = 'de'
- elif 'simplified chinese' in comments.lower():
- lang = 'zh'
- elif 'in russian' in comments.lower():
- lang = 'ru'
- # more languages?
-
- release_type = "article-journal"
-
- if metadata.find('journal-ref') and metadata.find('journal-ref').string:
- journal_ref = metadata.find('journal-ref').string.strip()
- extra_arxiv['journal_ref'] = journal_ref
- if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
- release_type = "conference-paper"
- if metadata.find('report-no') and metadata.find('report-no').string:
- extra['number'] = metadata.find('report-no').string.strip()
- release_type = "report"
- if metadata.find('acm-class') and metadata.find('acm-class').string:
- extra_arxiv['acm_class'] = metadata.find('acm_class').string.strip()
- if metadata.categories and metadata.categories.string:
- extra_arxiv['categories'] = metadata.categories.string.split()
- license_slug = None
- if metadata.license and metadata.license.string:
- # XXX: convert URL to slug
- license_slug = metadata.license.string.strip()
- abstracts = None
- if metadata.abstract:
- # TODO: test for this multi-abstract code path
- abstracts = []
- abst = metadata.abstract.string.strip()
- orig = None
- if '-----' in abst:
- both = abst.split('-----')
- abst = both[0].strip()
- orig = both[1].strip()
- if '$' in abst or '{' in abst:
- mime = "application/x-latex"
- abst_plain = latex2text.latex_to_text(abst)
- abstracts.append(dict(content=abst_plain, mime="text/plain", lang="en"))
- else:
- mime = "text/plain"
- abstracts.append(dict(content=abst, mime=mime, lang="en"))
- if orig:
- abstracts.append(dict(content=orig, mime=mime))
-
- if extra_arxiv:
- extra['arxiv'] = extra_arxiv
- if not extra:
- extra = None
-
- versions = []
- for version in metadata.find_all('version'):
- arxiv_id = base_id + version['version']
- release_date = version.date.string.strip()
- release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
- versions.append(dict(
- work_id=None,
- title=title,
- #original_title
- release_type="article-journal",
- release_status='submitted', # XXX: source_type?
- release_date=release_date.isoformat(),
- release_year=release_date.year,
- arxiv_id=arxiv_id,
- #doi (see below)
- #pmid
- #pmcid
- #isbn13 # never in Article
- #volume
- #issue
- #pages
- #publisher
- language=lang,
- #license_slug # not in MEDLINE
-
- # content, mimetype, lang
- abstracts=abstracts,
-
- # raw_name, role, raw_affiliation, extra
- contribs=contribs,
-
- # name, type, publisher, issnl
- # extra: issnp, issne, original_name, languages, country
- #container=container, # very little/none; resolve via DOI?
-
- # extra:
- # withdrawn_date
- # translation_of
- # subtitle
- # aliases
- # container_name
- # group-title
- # pubmed: retraction refs
- extra=extra,
- ))
-
- # only apply DOI to most recent version (HACK)
- if doi:
- versions[-1]['doi'] = doi
- versions[-1]['release_status'] = "published"
- return base_id, versions
-
-if __name__=='__main__':
- parser = ArxivRawXmlParser()
- parser.parse_file(open(sys.argv[1]))
diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py
new file mode 100644
index 00000000..726bafc5
--- /dev/null
+++ b/python/tests/import_arxiv.py
@@ -0,0 +1,96 @@
+
+import json, gzip
+import pytest
+from fatcat_tools.importers import ArxivRawImporter, Bs4XmlFilePusher
+from fixtures import api
+from bs4 import BeautifulSoup
+
+
+@pytest.fixture(scope="function")
+def arxiv_importer(api):
+ ari = ArxivRawImporter(api, bezerk_mode=True)
+ ari._test_override = True
+ return ari
+
+def test_arxiv_importer(arxiv_importer):
+ last_index = arxiv_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+ arxiv_importer.bezerk_mode = True
+ counts = Bs4XmlFilePusher(arxiv_importer, f, "record").run()
+ assert counts['insert'] == 2
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
+
+ # fetch most recent editgroup
+ change = arxiv_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "arxiv" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.ArxivRawImporter" in eg.extra['agent']
+
+ last_index = arxiv_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+ arxiv_importer.bezerk_mode = False
+ arxiv_importer.reset()
+ counts = Bs4XmlFilePusher(arxiv_importer, f, "record").run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 2
+ assert counts['skip'] == 0
+ assert last_index == arxiv_importer.api.get_changelog(limit=1)[0].index
+
+def test_arxiv_xml_parse(arxiv_importer):
+ with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+ soup = BeautifulSoup(f, "xml")
+ r = arxiv_importer.parse_record(soup.find_all("record")[0])
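+    # parse_record() returns one release per arXiv version; this fixture has v1 and v2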
+
+ r1 = r[0]
+ r2 = r[1]
+ print(r1.extra)
+ print(r2.extra)
+ assert r1.work_id == r2.work_id
+ assert r1.title == "Martingale theory for housekeeping heat"
+ assert r1.subtitle == None
+ assert r1.original_title == None
+ assert r1.release_type == "article-journal"
+ assert r1.release_stage == "submitted"
+ assert r2.release_stage == "published"
+ assert r1.license_slug == "ARXIV-NED-1.0"
+ assert r1.version == "v1"
+ assert r2.version == "v2"
+ assert r1.ext_ids.arxiv == "1810.09584v1"
+ assert r2.ext_ids.arxiv == "1810.09584v2"
+ assert r1.ext_ids.doi == None
+ assert r2.ext_ids.doi == "10.1209/0295-5075/124/60006"
+ assert r1.release_year == 2018
+ assert str(r1.release_date) == "2018-10-22"
+ assert r2.release_year == 2019
+ assert str(r2.release_date) == "2019-01-13"
+ # matched by ISSN, so shouldn't be in there?
+ #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London"
+ assert len(r1.contribs) == 4
+ # XXX: extra['arxiv'] stuff
+
+ assert r1.contribs[0].raw_name == "Raphael Chetrite"
+ assert r1.contribs[0].role == "author"
+ assert r1.contribs[1].raw_name == "Shamik Gupta"
+ assert r1.contribs[2].raw_name == "Izaak Neri"
+ assert r1.contribs[3].raw_name == "Édgar Roldán"
+ assert r1.contribs[3].role == "author"
+
+ assert len(r1.contribs) == 4
+ assert r1.contribs == r2.contribs
+
+ assert r1.abstracts[0].content.startswith("The housekeeping heat is the energy exchanged")
+ # order isn't deterministic
+ assert "application/x-latex" in [a.mimetype for a in r1.abstracts]
+ assert "text/plain" in [a.mimetype for a in r1.abstracts]
+
+ assert r1.abstracts == r2.abstracts
+
+ assert r1.extra['arxiv']['comments'] == "7 pages, 2 figures"
+ assert r1.extra['arxiv']['categories'] == ["cond-mat.stat-mech", "physics.bio-ph", "physics.data-an"]
+
+ assert r1.extra == r2.extra
+
+ assert not r1.refs