| Field | Value | Timestamp |
|---|---|---|
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-15 17:11:52 -0700 |
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
| commit | 91879651d7aa8a18a5fbd2b57dd60c171d6c8fba (patch) | |
| tree | df8d1fd330e41de9e0c9b0a7dcfa97a7dbe6cf02 /python | |
| parent | 1b592132fe1a127368189e07bdbf9a16a807a284 (diff) | |
initial arxivraw importer (from parser)
Diffstat (limited to 'python')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 |
| -rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 298 |
| -rw-r--r-- | python/parse_arxivraw_xml.py | 198 |
| -rw-r--r-- | python/tests/import_arxiv.py | 96 |

4 files changed, 395 insertions(+), 198 deletions(-)
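For context before reading the diff: the new importer consumes arXivRaw OAI-PMH records. Below is an illustrative sketch (not part of this commit, and not the actual `arxivraw_1810.09584.xml` test fixture): the element names follow the arXivRaw format, the values are trimmed and approximate, and the BeautifulSoup access pattern mirrors what the new `parse_record()` does.

```python
# Illustrative only: a trimmed arXivRaw-style record and the element access
# pattern used by ArxivRawImporter.parse_record() in the diff below.
from bs4 import BeautifulSoup  # the "xml" parser requires lxml

RECORD = """
<record>
  <metadata>
    <arXivRaw>
      <id>1810.09584</id>
      <version version="v1">
        <date>Mon, 22 Oct 2018 00:00:00 GMT</date>
      </version>
      <version version="v2">
        <date>Sun, 13 Jan 2019 00:00:00 GMT</date>
      </version>
      <title>Martingale theory for housekeeping heat</title>
      <authors>Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an</authors>
      <comments>7 pages, 2 figures</comments>
      <journal-ref>EPL 124, 60006 (2019)</journal-ref>
      <doi>10.1209/0295-5075/124/60006</doi>
      <abstract>The housekeeping heat is the energy exchanged ...</abstract>
    </arXivRaw>
  </metadata>
</record>
"""

soup = BeautifulSoup(RECORD, "xml")
metadata = soup.find("record").arXivRaw
print(metadata.id.string)                   # base arxiv_id, without version suffix
print(metadata.find("journal-ref").string)  # hyphenated tag names need find()
for version in metadata.find_all("version"):
    # each <version> element becomes one fatcat release, e.g. "1810.09584v1"
    print(metadata.id.string + version["version"], version.date.string)
```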
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 497946ea..8ec219f8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -16,6 +16,7 @@ from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, Sqlit
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
 from .jalc import JalcImporter
 from .jstor import JstorImporter
+from .arxiv import ArxivRawImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
new file mode 100644
index 00000000..c53e47f1
--- /dev/null
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -0,0 +1,298 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from pylatexenc.latex2text import LatexNodes2Text
+
+import fatcat_client
+from .common import EntityImporter, clean
+from .crossref import lookup_license_slug
+
+
+latex2text = LatexNodes2Text()
+
+def parse_arxiv_authors(raw):
+    if not raw:
+        return []
+    authors = raw.split(', ')
+    if authors:
+        last = authors[-1].split(" and ")
+        if len(last) == 2:
+            authors[-1] = last[0]
+            authors.append(last[1])
+    authors = [latex2text.latex_to_text(a).strip() for a in authors]
+    return authors
+
+def test_parse_arxiv_authors():
+
+    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Raphael Chetrite",
+        "Shamik Gupta",
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
+        "Raphael Chetrite Shamik Gupta",
+    ]
+
+
+class ArxivRawImporter(EntityImporter):
+    """
+    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
+
+    TODO: this will require a special importer that keeps works together
+    TODO: arxiv_id lookup in API (rust) with no version specified should select
+          the "most recent" version; can be a simple sort?
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
+        # lower batch size, because multiple versions per entry (guessing 2-3 on average?)
+        batch_size = kwargs.get('edit_batch_size', 50)
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            batch_size=batch_size,
+            **kwargs)
+        self._test_override = False
+
+
+    def parse_record(self, record):
+
+        metadata = record.arXivRaw
+        extra = dict()
+        extra_arxiv = dict()
+
+        base_id = metadata.id.string
+        doi = None
+        if metadata.doi and metadata.doi.string:
+            doi = metadata.doi.string.lower().strip()
+            assert doi.startswith('10.')
+        title = latex2text.latex_to_text(metadata.title.string)
+        authors = parse_arxiv_authors(metadata.authors.string)
+        contribs = [fatcat_client.ReleaseContrib(raw_name=a, role='author') for a in authors]
+
+        lang = "en"     # the vast majority in english
+        if metadata.comments and metadata.comments.string:
+            comments = metadata.comments.string.strip()
+            extra_arxiv['comments'] = comments
+            if 'in french' in comments.lower():
+                lang = 'fr'
+            elif 'in spanish' in comments.lower():
+                lang = 'es'
+            elif 'in portuguese' in comments.lower():
+                lang = 'pt'
+            elif 'in hindi' in comments.lower():
+                lang = 'hi'
+            elif 'in japanese' in comments.lower():
+                lang = 'ja'
+            elif 'in german' in comments.lower():
+                lang = 'de'
+            elif 'simplified chinese' in comments.lower():
+                lang = 'zh'
+            elif 'in russian' in comments.lower():
+                lang = 'ru'
+            # more languages?
+
+        release_type = "article-journal"
+
+        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
+            journal_ref = metadata.find('journal-ref').string.strip()
+            extra_arxiv['journal_ref'] = journal_ref
+            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
+                release_type = "conference-paper"
+        if metadata.find('report-no') and metadata.find('report-no').string:
+            extra['number'] = metadata.find('report-no').string.strip()
+            release_type = "report"
+        if metadata.find('acm-class') and metadata.find('acm-class').string:
+            extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
+        if metadata.categories and metadata.categories.string:
+            extra_arxiv['categories'] = metadata.categories.string.split()
+        license_slug = None
+        if metadata.license and metadata.license.string:
+            license_slug = lookup_license_slug(metadata.license.string)
+        abstracts = None
+        if metadata.abstract:
+            # TODO: test for this multi-abstract code path
+            abstracts = []
+            abst = metadata.abstract.string.strip()
+            orig = None
+            if '-----' in abst:
+                both = abst.split('-----')
+                abst = both[0].strip()
+                orig = both[1].strip()
+            if '$' in abst or '{' in abst:
+                mime = "application/x-latex"
+                abst_plain = latex2text.latex_to_text(abst)
+                abstracts.append(fatcat_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en"))
+            else:
+                mime = "text/plain"
+            abstracts.append(fatcat_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en"))
+            if orig:
+                abstracts.append(fatcat_client.ReleaseAbstract(content=orig, mimetype=mime))
+                # indicates that fulltext probably isn't english either
+                if lang == 'en':
+                    lang = None
+
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   arxiv: comments, categories, etc
+        extra_arxiv['base_id'] = base_id
+        extra['arxiv'] = extra_arxiv
+
+        versions = []
+        for version in metadata.find_all('version'):
+            arxiv_id = base_id + version['version']
+            release_date = version.date.string.strip()
+            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
+            # XXX: source_type?
+            versions.append(fatcat_client.ReleaseEntity(
+                work_id=None,
+                title=title,
+                #original_title
+                version=version['version'],
+                release_type=release_type,
+                release_stage='submitted',
+                release_date=release_date.isoformat(),
+                release_year=release_date.year,
+                ext_ids=fatcat_client.ReleaseExtIds(
+                    arxiv=arxiv_id,
+                ),
+                language=lang,
+                license_slug=license_slug,
+                abstracts=abstracts,
+                contribs=contribs,
+                extra=extra,
+            ))
+        # TODO: assert that versions are actually in order
+        assert versions
+
+        # only apply DOI to most recent version (HACK)
+        if doi:
+            versions[-1].ext_ids.doi = doi
+            versions[-1].release_stage = "published"
+        return versions
+
+    def try_update(self, versions):
+        """
+        This is pretty complex! There is no batch/bezerk mode for arxiv importer.
+
+        For each version, do a lookup by full arxiv_id, and store work/release
+        id results.
+
+        If a version has a DOI, also do a doi lookup and store that result. If
+        there is an existing release with both matching, set that as the
+        existing work. If they don't match, use the full arxiv_id match and
+        move on (maybe log or at least count the error?). If it's a
+        one/or/other case, update the existing release (and mark version as
+        existing).
+
+        If there was any existing release, take its work_id.
+
+        Iterate back through versions. If it didn't already exist, insert it
+        with any existing work_id. If there wasn't an existing work_id, lookup
+        the new release (by rev from edit?) and use that for the rest.
+
+        Do not pass any versions on for batch insert.
+        """
+
+        # first do lookups
+        any_work_id = None
+        for v in versions:
+            v._existing_work_id = None
+            v._updated = False
+            existing = None
+            existing_doi = None
+            try:
+                existing = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if v.ext_ids.doi:
+                try:
+                    existing_doi = self.api.lookup_release(doi=v.ext_ids.doi)
+                except fatcat_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+            if existing_doi:
+                if existing and existing.ident == existing_doi.ident:
+                    # great, they match and have idents, nothing to do
+                    pass
+                elif existing and existing.ident != existing_doi.ident:
+                    # could be bad, or could be that a new arxiv version was
+                    # created (update?)
+                    # stick with arxiv_id match as existing; don't update anything
+                    pass
+                else:
+                    assert not existing
+                    if not existing_doi.ext_ids.arxiv:
+                        # update the existing DOI-based record with our full arxiv_id
+                        existing_doi.ext_ids.arxiv = v.ext_ids.arxiv
+                        self.api.update_release(self.get_editgroup_id(), existing_doi.ident, existing_doi)
+                        self.counts['update'] += 1
+                        # as a flag to not count below
+                        v._updated = True
+                    existing = existing_doi
+
+            v._existing_work_id = existing.work_id
+            any_work_id = existing.work_id
+
+        last_edit = None
+        for v in versions:
+            if v._existing_work_id:
+                if not v._updated:
+                    self.counts['exists'] += 1
+                continue
+            if not any_work_id and last_edit:
+                # fetch the last inserted release from this group
+                r = self.api.get_release_rev(last_edit.rev)
+                assert r.work_id
+                any_work_id = r.work_id
+            v.work_id = any_work_id
+            last_edit = self.api.insert_release(self.get_editgroup_id(), v)
+            self.counts['insert'] += 1
+
+        return False
+
+    def insert_batch(self, batch_batch):
+        # there is no batch/bezerk mode for arxiv importer, except for testing
+        if self._test_override:
+            for batch in batch_batch:
+                self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+                    editgroup=fatcat_client.Editgroup(
+                        description=self.editgroup_description,
+                        extra=self.editgroup_extra),
+                    entity_list=batch))
+                self.counts['insert'] += len(batch) - 1
+        else:
+            raise NotImplementedError()
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_article on each
+        for article in soup.find_all("record"):
+            resp = self.parse_record(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
+if __name__=='__main__':
+    parser = ArxivRawImporter()
+    parser.parse_file(open(sys.argv[1]))
diff --git a/python/parse_arxivraw_xml.py b/python/parse_arxivraw_xml.py
deleted file mode 100644
index 9b9f28c9..00000000
--- a/python/parse_arxivraw_xml.py
+++ /dev/null
@@ -1,198 +0,0 @@
-
-import sys
-import json
-import datetime
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-from pylatexenc.latex2text import LatexNodes2Text
-
-
-latex2text = LatexNodes2Text()
-
-def parse_arxiv_authors(raw):
-    if not raw:
-        return []
-    authors = raw.split(', ')
-    if authors:
-        last = authors[-1].split(" and ")
-        if len(last) == 2:
-            authors[-1] = last[0]
-            authors.append(last[1])
-    authors = [latex2text.latex_to_text(a).strip() for a in authors]
-    return authors
-
-def test_parse_arxiv_authors():
-
-    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
-        "Raphael Chetrite",
-        "Shamik Gupta",
-        "Izaak Neri",
-        "Édgar Roldán",
-    ]
-    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
-        "Izaak Neri",
-        "Édgar Roldán",
-    ]
-    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
-        "Raphael Chetrite Shamik Gupta",
-    ]
-
-class ArxivRawXmlParser():
-    """
-    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
-
-    TODO: this will require a special importer that keeps works together
-    TODO: arxiv_id lookup in API (rust) with no version specified should select
-          the "most recent" version; can be a simple sort?
-    """
-
-    def __init__(self):
-        pass
-
-    def parse_file(self, handle):
-
-        # 1. open with beautiful soup
-        soup = BeautifulSoup(handle, "xml")
-
-        # 2. iterate over articles, call parse_article on each
-        for article in soup.find_all("record"):
-            resp = self.parse_record(article)
-            print(json.dumps(resp))
-            #sys.exit(-1)
-
-
-    def parse_record(self, record):
-
-        metadata = record.arXivRaw
-        extra = dict()
-        extra_arxiv = dict()
-
-        base_id = metadata.id.string
-        doi = None
-        if metadata.doi and metadata.doi.string:
-            doi = metadata.doi.string.lower().strip()
-            assert doi.startswith('10.')
-        title = latex2text.latex_to_text(metadata.title.string)
-        authors = parse_arxiv_authors(metadata.authors.string)
-        contribs = [dict(raw_name=a, role='author') for a in authors]
-
-        lang = "en"     # the vast majority in english
-        if metadata.comments and metadata.comments.string:
-            comments = metadata.comments.string.strip()
-            extra_arxiv['comments'] = comments
-            if 'in french' in comments.lower():
-                lang = 'fr'
-            elif 'in spanish' in comments.lower():
-                lang = 'es'
-            elif 'in portuguese' in comments.lower():
-                lang = 'pt'
-            elif 'in hindi' in comments.lower():
-                lang = 'hi'
-            elif 'in japanese' in comments.lower():
-                lang = 'ja'
-            elif 'in german' in comments.lower():
-                lang = 'de'
-            elif 'simplified chinese' in comments.lower():
-                lang = 'zh'
-            elif 'in russian' in comments.lower():
-                lang = 'ru'
-            # more languages?
-
-        release_type = "article-journal"
-
-        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
-            journal_ref = metadata.find('journal-ref').string.strip()
-            extra_arxiv['journal_ref'] = journal_ref
-            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
-                release_type = "conference-paper"
-        if metadata.find('report-no') and metadata.find('report-no').string:
-            extra['number'] = metadata.find('report-no').string.strip()
-            release_type = "report"
-        if metadata.find('acm-class') and metadata.find('acm-class').string:
-            extra_arxiv['acm_class'] = metadata.find('acm_class').string.strip()
-        if metadata.categories and metadata.categories.string:
-            extra_arxiv['categories'] = metadata.categories.string.split()
-        license_slug = None
-        if metadata.license and metadata.license.string:
-            # XXX: convert URL to slug
-            license_slug = metadata.license.string.strip()
-        abstracts = None
-        if metadata.abstract:
-            # TODO: test for this multi-abstract code path
-            abstracts = []
-            abst = metadata.abstract.string.strip()
-            orig = None
-            if '-----' in abst:
-                both = abst.split('-----')
-                abst = both[0].strip()
-                orig = both[1].strip()
-            if '$' in abst or '{' in abst:
-                mime = "application/x-latex"
-                abst_plain = latex2text.latex_to_text(abst)
-                abstracts.append(dict(content=abst_plain, mime="text/plain", lang="en"))
-            else:
-                mime = "text/plain"
-            abstracts.append(dict(content=abst, mime=mime, lang="en"))
-            if orig:
-                abstracts.append(dict(content=orig, mime=mime))
-
-        if extra_arxiv:
-            extra['arxiv'] = extra_arxiv
-        if not extra:
-            extra = None
-
-        versions = []
-        for version in metadata.find_all('version'):
-            arxiv_id = base_id + version['version']
-            release_date = version.date.string.strip()
-            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
-            versions.append(dict(
-                work_id=None,
-                title=title,
-                #original_title
-                release_type="article-journal",
-                release_status='submitted', # XXX: source_type?
-                release_date=release_date.isoformat(),
-                release_year=release_date.year,
-                arxiv_id=arxiv_id,
-                #doi (see below)
-                #pmid
-                #pmcid
-                #isbn13     # never in Article
-                #volume
-                #issue
-                #pages
-                #publisher
-                language=lang,
-                #license_slug   # not in MEDLINE
-
-                # content, mimetype, lang
-                abstracts=abstracts,
-
-                # raw_name, role, raw_affiliation, extra
-                contribs=contribs,
-
-                #   name, type, publisher, issnl
-                #   extra: issnp, issne, original_name, languages, country
-                #container=container,   # very little/none; resolve via DOI?
-
-                # extra:
-                #   withdrawn_date
-                #   translation_of
-                #   subtitle
-                #   aliases
-                #   container_name
-                #   group-title
-                #   pubmed: retraction refs
-                extra=extra,
-            ))
-
-        # only apply DOI to most recent version (HACK)
-        if doi:
-            versions[-1]['doi'] = doi
-            versions[-1]['release_status'] = "published"
-        return base_id, versions
-
-if __name__=='__main__':
-    parser = ArxivRawXmlParser()
-    parser.parse_file(open(sys.argv[1]))
diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py
new file mode 100644
index 00000000..726bafc5
--- /dev/null
+++ b/python/tests/import_arxiv.py
@@ -0,0 +1,96 @@
+
+import json, gzip
+import pytest
+from fatcat_tools.importers import ArxivRawImporter, Bs4XmlFilePusher
+from fixtures import api
+from bs4 import BeautifulSoup
+
+
+@pytest.fixture(scope="function")
+def arxiv_importer(api):
+    ari = ArxivRawImporter(api, bezerk_mode=True)
+    ari._test_override = True
+    return ari
+
+def test_arxiv_importer(arxiv_importer):
+    last_index = arxiv_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+        arxiv_importer.bezerk_mode = True
+        counts = Bs4XmlFilePusher(arxiv_importer, f, "record").run()
+    assert counts['insert'] == 2
+    assert counts['exists'] == 0
+    assert counts['skip'] == 0
+
+    # fetch most recent editgroup
+    change = arxiv_importer.api.get_changelog_entry(index=last_index+1)
+    eg = change.editgroup
+    assert eg.description
+    assert "arxiv" in eg.description.lower()
+    assert eg.extra['git_rev']
+    assert "fatcat_tools.ArxivRawImporter" in eg.extra['agent']
+
+    last_index = arxiv_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+        arxiv_importer.bezerk_mode = False
+        arxiv_importer.reset()
+        counts = Bs4XmlFilePusher(arxiv_importer, f, "record").run()
+    assert counts['insert'] == 0
+    assert counts['exists'] == 2
+    assert counts['skip'] == 0
+    assert last_index == arxiv_importer.api.get_changelog(limit=1)[0].index
+
+def test_arxiv_xml_parse(arxiv_importer):
+    with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+        soup = BeautifulSoup(f, "xml")
+        r = arxiv_importer.parse_record(soup.find_all("record")[0])
+
+    r1 = r[0]
+    r2 = r[1]
+    print(r1.extra)
+    print(r2.extra)
+    assert r1.work_id == r2.work_id
+    assert r1.title == "Martingale theory for housekeeping heat"
+    assert r1.subtitle == None
+    assert r1.original_title == None
+    assert r1.release_type == "article-journal"
+    assert r1.release_stage == "submitted"
+    assert r2.release_stage == "published"
+    assert r1.license_slug == "ARXIV-NED-1.0"
+    assert r1.version == "v1"
+    assert r2.version == "v2"
+    assert r1.ext_ids.arxiv == "1810.09584v1"
+    assert r2.ext_ids.arxiv == "1810.09584v2"
+    assert r1.ext_ids.doi == None
+    assert r2.ext_ids.doi == "10.1209/0295-5075/124/60006"
+    assert r1.release_year == 2018
+    assert str(r1.release_date) == "2018-10-22"
+    assert r2.release_year == 2019
+    assert str(r2.release_date) == "2019-01-13"
+    # matched by ISSN, so shouldn't be in there?
+    #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London"
+    assert len(r1.contribs) == 4
+    # XXX: extra['arxiv'] stuff
+
+    assert r1.contribs[0].raw_name == "Raphael Chetrite"
+    assert r1.contribs[0].role == "author"
+    assert r1.contribs[1].raw_name == "Shamik Gupta"
+    assert r1.contribs[2].raw_name == "Izaak Neri"
+    assert r1.contribs[3].raw_name == "Édgar Roldán"
+    assert r1.contribs[3].role == "author"
+
+    assert len(r1.contribs) == 4
+    assert r1.contribs == r2.contribs
+
+    assert r1.abstracts[0].content.startswith("The housekeeping heat is the energy exchanged")
+    # order isn't deterministic
+    assert "application/x-latex" in [a.mimetype for a in r1.abstracts]
+    assert "text/plain" in [a.mimetype for a in r1.abstracts]
+
+    assert r1.abstracts == r2.abstracts
+
+    assert r1.extra['arxiv']['comments'] == "7 pages, 2 figures"
+    assert r1.extra['arxiv']['categories'] == ["cond-mat.stat-mech", "physics.bio-ph", "physics.data-an"]
+
+    assert r1.extra == r2.extra
+
+    assert not r1.refs
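Not part of this commit, but for orientation: a minimal sketch of how the new importer could be driven over a local arXivRaw XML dump, reusing the Bs4XmlFilePusher pattern from the test above. The authenticated_api() helper and the localhost API endpoint are assumptions borrowed from how other importers are invoked via fatcat_import.py; the actual CLI wiring for ArxivRawImporter is not added in this diff.

```python
# Hypothetical driver script, mirroring tests/import_arxiv.py above.
# Assumes authenticated_api() and the default local API endpoint; adjust to
# however the importer eventually gets wired into fatcat_import.py.
import sys

from fatcat_tools import authenticated_api
from fatcat_tools.importers import ArxivRawImporter, Bs4XmlFilePusher

def main():
    api = authenticated_api("http://localhost:9411/v0")  # assumed dev endpoint
    importer = ArxivRawImporter(api)
    with open(sys.argv[1], "r") as xml_file:
        # push one <record> element at a time, exactly like the test does
        counts = Bs4XmlFilePusher(importer, xml_file, "record").run()
    print(counts, file=sys.stderr)

if __name__ == "__main__":
    main()
```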
