initial arxivraw importer (from parser)

author: Bryan Newbold <bnewbold@robocracy.org> 2019-05-15 17:11:52 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit: 91879651d7aa8a18a5fbd2b57dd60c171d6c8fba (patch)
tree: df8d1fd330e41de9e0c9b0a7dcfa97a7dbe6cf02 /python/fatcat_tools
parent: 1b592132fe1a127368189e07bdbf9a16a807a284 (diff)
download: fatcat-91879651d7aa8a18a5fbd2b57dd60c171d6c8fba.tar.gz
fatcat-91879651d7aa8a18a5fbd2b57dd60c171d6c8fba.zip
2 files changed, 299 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 497946ea..8ec219f8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -16,6 +16,7 @@ from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, Sqlit
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
 from .jalc import JalcImporter
 from .jstor import JstorImporter
+from .arxiv import ArxivRawImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
new file mode 100644
index 00000000..c53e47f1
--- /dev/null
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -0,0 +1,298 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from pylatexenc.latex2text import LatexNodes2Text
+
+import fatcat_client
+from .common import EntityImporter, clean
+from .crossref import lookup_license_slug
+
+
+latex2text = LatexNodes2Text()
+
+def parse_arxiv_authors(raw):
+    if not raw:
+        return []
+    authors = raw.split(', ')
+    if authors:
+        last = authors[-1].split(" and ")
+        if len(last) == 2:
+            authors[-1] = last[0]
+            authors.append(last[1])
+    authors = [latex2text.latex_to_text(a).strip() for a in authors]
+    return authors
+
+def test_parse_arxiv_authors():
+
+    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Raphael Chetrite",
+        "Shamik Gupta",
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
+        "Raphael Chetrite Shamik Gupta",
+    ]
+
+
+class ArxivRawImporter(EntityImporter):
+    """
+    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
+
+    TODO: this will require a special importer that keeps works together
+    TODO: arxiv_id lookup in API (rust) with no version specified should select
+          the "most recent" version; can be a simple sort?
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
+        # lower batch size, because multiple versions per entry (guessing 2-3 on average?)
+        batch_size = kwargs.get('edit_batch_size', 50)
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            batch_size=batch_size,
+            **kwargs)
+        self._test_override = False
+
+
+    def parse_record(self, record):
+
+        metadata = record.arXivRaw
+        extra = dict()
+        extra_arxiv = dict()
+
+        base_id = metadata.id.string
+        doi = None
+        if metadata.doi and metadata.doi.string:
+            doi = metadata.doi.string.lower().strip()
+            assert doi.startswith('10.')
+        title = latex2text.latex_to_text(metadata.title.string)
+        authors = parse_arxiv_authors(metadata.authors.string)
+        contribs = [fatcat_client.ReleaseContrib(raw_name=a, role='author') for a in authors]
+
+        lang = "en"     # the vast majority in english
+        if metadata.comments and metadata.comments.string:
+            comments = metadata.comments.string.strip()
+            extra_arxiv['comments'] = comments
+            if 'in french' in comments.lower():
+                lang = 'fr'
+            elif 'in spanish' in comments.lower():
+                lang = 'es'
+            elif 'in portuguese' in comments.lower():
+                lang = 'pt'
+            elif 'in hindi' in comments.lower():
+                lang = 'hi'
+            elif 'in japanese' in comments.lower():
+                lang = 'ja'
+            elif 'in german' in comments.lower():
+                lang = 'de'
+            elif 'simplified chinese' in comments.lower():
+                lang = 'zh'
+            elif 'in russian' in comments.lower():
+                lang = 'ru'
+            # more languages?
+
+        release_type = "article-journal"
+
+        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
+            journal_ref = metadata.find('journal-ref').string.strip()
+            extra_arxiv['journal_ref'] = journal_ref
+            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
+                release_type = "conference-paper"
+        if metadata.find('report-no') and metadata.find('report-no').string:
+            extra['number'] = metadata.find('report-no').string.strip()
+            release_type = "report"
+        if metadata.find('acm-class') and metadata.find('acm-class').string:
+            extra_arxiv['acm_class'] = metadata.find('acm_class').string.strip()
+        if metadata.categories and metadata.categories.string:
+            extra_arxiv['categories'] = metadata.categories.string.split()
+        license_slug = None
+        if metadata.license and metadata.license.string:
+            license_slug = lookup_license_slug(metadata.license.string)
+        abstracts = None
+        if metadata.abstract:
+            # TODO: test for this multi-abstract code path
+            abstracts = []
+            abst = metadata.abstract.string.strip()
+            orig = None
+            if '-----' in abst:
+                both = abst.split('-----')
+                abst = both[0].strip()
+                orig = both[1].strip()
+            if '$' in abst or '{' in abst:
+                mime = "application/x-latex"
+                abst_plain = latex2text.latex_to_text(abst)
+                abstracts.append(fatcat_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en"))
+            else:
+                mime = "text/plain"
+            abstracts.append(fatcat_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en"))
+            if orig:
+                abstracts.append(fatcat_client.ReleaseAbstract(content=orig, mimetype=mime))
+                # indicates that fulltext probably isn't english either
+                if lang == 'en':
+                    lang = None
+
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   arxiv: comments, categories, etc
+        extra_arxiv['base_id'] = base_id
+        extra['arxiv'] = extra_arxiv
+
+        versions = []
+        for version in metadata.find_all('version'):
+            arxiv_id = base_id + version['version']
+            release_date = version.date.string.strip()
+            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
+            # XXX: source_type?
+            versions.append(fatcat_client.ReleaseEntity(
+                work_id=None,
+                title=title,
+                #original_title
+                version=version['version'],
+                release_type="article-journal",
+                release_stage='submitted',
+                release_date=release_date.isoformat(),
+                release_year=release_date.year,
+                ext_ids=fatcat_client.ReleaseExtIds(
+                    arxiv=arxiv_id,
+                ),
+                language=lang,
+                license_slug=license_slug,
+                abstracts=abstracts,
+                contribs=contribs,
+                extra=extra,
+            ))
+        # TODO: assert that versions are actually in order
+        assert versions
+
+        # only apply DOI to most recent version (HACK)
+        if doi:
+            versions[-1].ext_ids.doi = doi
+            versions[-1].release_stage = "published"
+        return versions
+
+    def try_update(self, versions):
+        """
+        This is pretty complex! There is no batch/bezerk mode for arxiv importer.
+
+        For each version, do a lookup by full arxiv_id, and store work/release
+        id results.
+        
+        If a version has a DOI, also do a doi lookup and store that result. If
+        there is an existing release with both matching, set that as the
+        existing work. If they don't match, use the full arxiv_id match and
+        move on (maybe log or at least count the error?). If it's a
+        one/or/other case, update the existing release (and mark version as
+        existing).
+
+        If there was any existing release, take its work_id.
+
+        Iterate back through versions. If it didn't already exist, insert it
+        with any existing work_id. If there wasn't an existing work_id, lookup
+        the new release (by rev from edit?) and use that for the rest.
+
+        Do not pass any versions on for batch insert.
+        """
+
+        # first do lookups
+        any_work_id = None
+        for v in versions:
+            v._existing_work_id = None
+            v._updated = False
+            existing = None
+            existing_doi = None
+            try:
+                existing = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if v.ext_ids.doi:
+                try:
+                    existing_doi = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
+                except fatcat_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+            if existing_doi:
+                if existing and existing.ident == existing_doi.ident:
+                    # great, they match and have idents, nothing to do
+                    pass
+                elif existing and existing.ident != existing_doi.ident:
+                    # could be bad, or could be that a new arxiv version was
+                    # created (update?)
+                    # stick with arxiv_id match as existing; don't update anything
+                    pass
+                else:
+                    assert not existing
+                    if not existing_doi.ext_ids.arxiv_id:
+                        # update the existing DOI-based record with our full arxiv_id
+                        existing_doi.ext_ids.arxiv_id = v.ext_ids.arxiv_id
+                        self.api.update_release(self.get_editgroup_id(), existing_doi.ident, existing_doi)
+                        self.counts['update'] += 1
+                        # as a flag to not count below
+                        v._updated = True
+                    existing = existing_doi
+            
+            v._existing_work_id = existing.work_id
+            any_work_id = existing.work_id
+
+        last_edit = None
+        for v in versions:
+            if v._existing_work_id:
+                if not v._updated:
+                    self.counts['exists'] += 1
+                continue
+            if not any_work_id and last_edit:
+                # fetch the last inserted release from this group
+                r = self.api.get_release_rev(last_edit.rev)
+                assert r.work_id
+                any_work_id = r.work_id
+            v.work_id = any_work_id
+            last_edit = self.api.insert_release(self.get_editgroup_id(), v)
+            self.counts['insert'] += 1
+
+        return False
+
+    def insert_batch(self, batch_batch):
+        # there is no batch/bezerk mode for arxiv importer, except for testing
+        if self._test_override:
+            for batch in batch_batch:
+                self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+                    editgroup=fatcat_client.Editgroup(
+                        description=self.editgroup_description,
+                        extra=self.editgroup_extra),
+                    entity_list=batch))
+                self.counts['insert'] += len(batch) - 1
+        else:
+            raise NotImplementedError()
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_article on each
+        for article in soup.find_all("record"):
+            resp = self.parse_record(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
+if __name__=='__main__':
+    parser = ArxivRawImporter()
+    parser.parse_file(open(sys.argv[1]))
author	Bryan Newbold <bnewbold@robocracy.org>	2019-05-15 17:11:52 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-05-21 11:41:29 -0700
commit	91879651d7aa8a18a5fbd2b57dd60c171d6c8fba (patch)
tree	df8d1fd330e41de9e0c9b0a7dcfa97a7dbe6cf02 /python/fatcat_tools
parent	1b592132fe1a127368189e07bdbf9a16a807a284 (diff)
download	fatcat-91879651d7aa8a18a5fbd2b57dd60c171d6c8fba.tar.gz fatcat-91879651d7aa8a18a5fbd2b57dd60c171d6c8fba.zip