Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/importers/__init__.py |   1 +
-rw-r--r--  python/fatcat_tools/importers/arxiv.py    | 298 ++++++++++++++++++++
2 files changed, 299 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 497946ea..8ec219f8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -16,6 +16,7 @@ from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, Sqlit
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
 from .jalc import JalcImporter
 from .jstor import JstorImporter
+from .arxiv import ArxivRawImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
new file mode 100644
index 00000000..c53e47f1
--- /dev/null
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -0,0 +1,298 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from pylatexenc.latex2text import LatexNodes2Text
+
+import fatcat_client
+from .common import EntityImporter, clean
+from .crossref import lookup_license_slug
+
+
+latex2text = LatexNodes2Text()
+
+def parse_arxiv_authors(raw):
+    if not raw:
+        return []
+    authors = raw.split(', ')
+    if authors:
+        last = authors[-1].split(" and ")
+        if len(last) == 2:
+            authors[-1] = last[0]
+            authors.append(last[1])
+    authors = [latex2text.latex_to_text(a).strip() for a in authors]
+    return authors
+
+def test_parse_arxiv_authors():
+
+    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Raphael Chetrite",
+        "Shamik Gupta",
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
+        "Raphael Chetrite Shamik Gupta",
+    ]
+
+
+class ArxivRawImporter(EntityImporter):
+    """
+    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
+
+    TODO: this will require a special importer that keeps works together
+    TODO: arxiv_id lookup in API (rust) with no version specified should select
+          the "most recent" version; can be a simple sort?
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
+        # lower batch size, because multiple versions per entry (guessing 2-3 on average?)
+        batch_size = kwargs.get('edit_batch_size', 50)
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            batch_size=batch_size,
+            **kwargs)
+        self._test_override = False
+
+
+    def parse_record(self, record):
+
+        metadata = record.arXivRaw
+        extra = dict()
+        extra_arxiv = dict()
+
+        base_id = metadata.id.string
+        doi = None
+        if metadata.doi and metadata.doi.string:
+            doi = metadata.doi.string.lower().strip()
+            assert doi.startswith('10.')
+        title = latex2text.latex_to_text(metadata.title.string)
+        authors = parse_arxiv_authors(metadata.authors.string)
+        contribs = [fatcat_client.ReleaseContrib(raw_name=a, role='author') for a in authors]
+
+        lang = "en"     # the vast majority are in English
+        if metadata.comments and metadata.comments.string:
+            comments = metadata.comments.string.strip()
+            extra_arxiv['comments'] = comments
+            if 'in french' in comments.lower():
+                lang = 'fr'
+            elif 'in spanish' in comments.lower():
+                lang = 'es'
+            elif 'in portuguese' in comments.lower():
+                lang = 'pt'
+            elif 'in hindi' in comments.lower():
+                lang = 'hi'
+            elif 'in japanese' in comments.lower():
+                lang = 'ja'
+            elif 'in german' in comments.lower():
+                lang = 'de'
+            elif 'simplified chinese' in comments.lower():
+                lang = 'zh'
+            elif 'in russian' in comments.lower():
+                lang = 'ru'
+            # more languages?
+
+        release_type = "article-journal"
+
+        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
+            journal_ref = metadata.find('journal-ref').string.strip()
+            extra_arxiv['journal_ref'] = journal_ref
+            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
+                release_type = "conference-paper"
+        if metadata.find('report-no') and metadata.find('report-no').string:
+            extra['number'] = metadata.find('report-no').string.strip()
+            release_type = "report"
+        if metadata.find('acm-class') and metadata.find('acm-class').string:
+            extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
+        if metadata.categories and metadata.categories.string:
+            extra_arxiv['categories'] = metadata.categories.string.split()
+        license_slug = None
+        if metadata.license and metadata.license.string:
+            license_slug = lookup_license_slug(metadata.license.string)
+        abstracts = None
+        if metadata.abstract:
+            # TODO: test for this multi-abstract code path
+            abstracts = []
+            abst = metadata.abstract.string.strip()
+            orig = None
+            if '-----' in abst:
+                both = abst.split('-----')
+                abst = both[0].strip()
+                orig = both[1].strip()
+            if '$' in abst or '{' in abst:
+                mime = "application/x-latex"
+                abst_plain = latex2text.latex_to_text(abst)
+                abstracts.append(fatcat_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en"))
+            else:
+                mime = "text/plain"
+            abstracts.append(fatcat_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en"))
+            if orig:
+                abstracts.append(fatcat_client.ReleaseAbstract(content=orig, mimetype=mime))
+                # indicates that fulltext probably isn't english either
+                if lang == 'en':
+                    lang = None
+
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   arxiv: comments, categories, etc
+        extra_arxiv['base_id'] = base_id
+        extra['arxiv'] = extra_arxiv
+
+        versions = []
+        for version in metadata.find_all('version'):
+            arxiv_id = base_id + version['version']
+            release_date = version.date.string.strip()
+            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
+            # XXX: source_type?
+            versions.append(fatcat_client.ReleaseEntity(
+                work_id=None,
+                title=title,
+                #original_title
+                version=version['version'],
+                release_type=release_type,
+                release_stage='submitted',
+                release_date=release_date.isoformat(),
+                release_year=release_date.year,
+                ext_ids=fatcat_client.ReleaseExtIds(
+                    arxiv=arxiv_id,
+                ),
+                language=lang,
+                license_slug=license_slug,
+                abstracts=abstracts,
+                contribs=contribs,
+                extra=extra,
+            ))
+        # TODO: assert that versions are actually in order
+        assert versions
+
+        # only apply DOI to most recent version (HACK)
+        if doi:
+            versions[-1].ext_ids.doi = doi
+            versions[-1].release_stage = "published"
+        return versions
+
+    def try_update(self, versions):
+        """
+        This is pretty complex! There is no batch/bezerk mode for the arxiv importer.
+
+        For each version, do a lookup by full arxiv_id, and store work/release
+        id results.
+
+        If a version has a DOI, also do a doi lookup and store that result. If
+        there is an existing release with both matching, set that as the
+        existing work. If they don't match, use the full arxiv_id match and
+        move on (maybe log or at least count the error?). If it's a
+        one-or-the-other case, update the existing release (and mark the
+        version as existing).
+
+        If there was any existing release, take its work_id.
+
+        Iterate back through versions. If it didn't already exist, insert it
+        with any existing work_id. If there wasn't an existing work_id, lookup
+        the new release (by rev from edit?) and use that for the rest.
+
+        Do not pass any versions on for batch insert.
+        """
+
+        # first do lookups
+        any_work_id = None
+        for v in versions:
+            v._existing_work_id = None
+            v._updated = False
+            existing = None
+            existing_doi = None
+            try:
+                existing = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if v.ext_ids.doi:
+                try:
+                    existing_doi = self.api.lookup_release(doi=v.ext_ids.doi)
+                except fatcat_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+            if existing_doi:
+                if existing and existing.ident == existing_doi.ident:
+                    # great, they match and have idents, nothing to do
+                    pass
+                elif existing and existing.ident != existing_doi.ident:
+                    # could be bad, or could be that a new arxiv version was
+                    # created (update?)
+                    # stick with the arxiv_id match as existing; don't update anything
+                    pass
+                else:
+                    assert not existing
+                    if not existing_doi.ext_ids.arxiv:
+                        # update the existing DOI-based record with our full arxiv_id
+                        existing_doi.ext_ids.arxiv = v.ext_ids.arxiv
+                        self.api.update_release(self.get_editgroup_id(), existing_doi.ident, existing_doi)
+                        self.counts['update'] += 1
+                        # as a flag to not count below
+                        v._updated = True
+                    existing = existing_doi
+
+            if existing:
+                v._existing_work_id = any_work_id = existing.work_id
+
+        last_edit = None
+        for v in versions:
+            if v._existing_work_id:
+                if not v._updated:
+                    self.counts['exists'] += 1
+                continue
+            if not any_work_id and last_edit:
+                # fetch the last inserted release from this editgroup
+                r = self.api.get_release_rev(last_edit.rev)
+                assert r.work_id
+                any_work_id = r.work_id
+            v.work_id = any_work_id
+            last_edit = self.api.insert_release(self.get_editgroup_id(), v)
+            self.counts['insert'] += 1
+
+        return False
+
+    def insert_batch(self, batch_batch):
+        # there is no batch/bezerk mode for the arxiv importer, except for testing
+        if self._test_override:
+            for batch in batch_batch:
+                self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+                    editgroup=fatcat_client.Editgroup(
+                        description=self.editgroup_description,
+                        extra=self.editgroup_extra),
+                    entity_list=batch))
+                self.counts['insert'] += len(batch) - 1
+        else:
+            raise NotImplementedError()
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over records, call parse_record() on each
+        for article in soup.find_all("record"):
+            resp = self.parse_record(article)
+            print(json.dumps([r.to_dict() for r in resp]))
+            #sys.exit(-1)
+
+if __name__=='__main__':
+    parser = ArxivRawImporter(None)  # api not needed just to parse and print
+    parser.parse_file(open(sys.argv[1]))
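For reference, a minimal standalone sketch (not part of the commit) of the record shape parse_record() expects. The element names follow the arXivRaw OAI-PMH schema, but every value below is invented, and BeautifulSoup's "xml" parser assumes lxml is installed:

import datetime
from bs4 import BeautifulSoup

# invented sample, shaped like an arXivRaw OAI-PMH <record>
SAMPLE = """
<record>
  <arXivRaw>
    <id>0801.0001</id>
    <title>An Example Title</title>
    <authors>Izaak Neri and \\'Edgar Rold\\'an</authors>
    <categories>cond-mat.stat-mech</categories>
    <journal-ref>Proc. Example Conf. (2008)</journal-ref>
    <version version="v1">
      <date>Mon, 7 Jan 2008 21:24:51 GMT</date>
    </version>
  </arXivRaw>
</record>
"""

soup = BeautifulSoup(SAMPLE, "xml")
metadata = soup.record.arXivRaw
print(metadata.id.string)                   # 0801.0001
print(metadata.find('journal-ref').string)  # Proc. Example Conf. (2008)
for version in metadata.find_all('version'):
    # same format string parse_record() uses for arXivRaw version dates
    when = datetime.datetime.strptime(
        version.date.string.strip(), "%a, %d %b %Y %H:%M:%S %Z")
    print(version['version'], when.date().isoformat())  # v1 2008-01-07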
