diff options
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/jstor.py (renamed from python/parse_jstor_xml.py) | 170 | 
2 files changed, 132 insertions, 39 deletions
| diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index ecbfe38e..497946ea 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -15,6 +15,7 @@ To run an import you combine two classes; one each of:  from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk  from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP  from .jalc import JalcImporter +from .jstor import JstorImporter  from .grobid_metadata import GrobidMetadataImporter  from .journal_metadata import JournalMetadataImporter  from .matched import MatchedImporter diff --git a/python/parse_jstor_xml.py b/python/fatcat_tools/importers/jstor.py index 04f2b18e..6ab320c3 100644 --- a/python/parse_jstor_xml.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,31 +1,56 @@ +  import sys  import json +import sqlite3  import datetime +import itertools +import subprocess  from bs4 import BeautifulSoup -from bs4.element import NavigableString +import fatcat_client +from .common import EntityImporter, clean + +# is this just ISO 3-char to ISO 2-char? 
+# XXX: more entries +JSTOR_LANG_MAP = { +    'eng': 'en', +} -class JstorXmlParser(): +# XXX: more entries +JSTOR_CONTRIB_MAP = { +    'author': 'author', +    'editor': 'editor', +    'translator': 'translator', +    'illustrator': 'illustrator', +} + +class JstorImporter(EntityImporter):      """ -    Converts JSTOR bulk XML metadata (eg, from their Early Journals Collection) +    Importer for JSTOR bulk XML metadata (eg, from their Early Journals +    Collection)      """ -    def __init__(self): -        pass +    def __init__(self, api, issn_map_file, **kwargs): -    def parse_file(self, handle): +        eg_desc = kwargs.get('editgroup_description', +            "Automated import of JSTOR XML metadata") +        eg_extra = kwargs.get('editgroup_extra', dict()) +        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter') +        super().__init__(api, +            issn_map_file=issn_map_file, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra, +            **kwargs) -        # 1. open with beautiful soup -        soup = BeautifulSoup(handle, "xml") +        self.create_containers = kwargs.get('create_containers') -        # 2. 
iterate over articles, call parse_article on each -        for article in soup.find_all("article"): -            resp = self.parse_article(article) -            print(json.dumps(resp)) -            #sys.exit(-1) +        self.read_issn_map_file(issn_map_file) -    def parse_article(self, article): +    def want(self, obj): +        return True + +    def parse_record(self, article):          journal_meta = article.front.find("journal-meta")          article_meta = article.front.find("article-meta") @@ -42,16 +67,22 @@ class JstorXmlParser():                  issn = "{}-{}".format(issn[0:4], issn[4:8])              else:                  assert len(issn) == 9 +        # XXX: +        container_id = None          container = dict(              name=journal_title,              publisher=publisher,              issn=issn,   # TODO: ISSN-L lookup...          ) -        doi = article_meta.find("article-id", attr={"pub-id-type": "doi"}) +        doi = article_meta.find("article-id", {"pub-id-type": "doi"})          if doi:              doi = doi.string.lower().strip() +        jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"}) +        if jstor_id: +            jstor_id = jstor_id.string +          title = article_meta.find("article-title")          if title:              title = title.string.strip() @@ -70,9 +101,11 @@ class JstorXmlParser():                      name = surname.string                  else:                      name = None -                contribs.append(dict( -                    role=c['contrib-type'],   # XXX: types? mapping? 
-                    raw_name=name, +                contribs.append(fatcat_client.ReleaseContrib( +                    role=JSTOR_CONTRIB_MAP.get(c['contrib-type']), +                    raw_name=clean(name), +                    given_name=clean(given.string), +                    surname=clean(surname.string),                  ))          release_year = None @@ -104,9 +137,11 @@ class JstorXmlParser():          cm = article_meta.find("custom-meta")          if cm.find("meta-name").string == "lang":              language = cm.find("meta-value").string +            language = JSTOR_LANG_MAP.get(language)          release_type = "article-journal"          if "[Abstract]" in title: +            # TODO: strip the "[Abstract]" bit?              release_type = "abstract"          elif "[Editorial" in title:              release_type = "editorial" @@ -119,38 +154,53 @@ class JstorXmlParser():              # strip brackets if that is all that is there (eg, translation or non-english)              title = title[1:-1] +        # JSTOR issue-id +        if article_meta.find('issue-id'): +            issue_id = clean(article_meta.find('issue-id').string) +            if issue_id: +                extra_jstor['issue_id'] = issue_id + +        # JSTOR journal-id +        # XXX: +          # everything in JSTOR is published -        release_status = "published" +        release_stage = "published" +        # extra: +        #   withdrawn_date +        #   translation_of +        #   subtitle +        #   aliases +        #   container_name +        #   group-title +        #   pubmed: retraction refs          if extra_jstor:              extra['jstor'] = extra_jstor          if not extra:              extra = None -        re = dict( -            issn=issn, # not an entity field +        re = fatcat_client.ReleaseEntity(              #work_id              title=title,              #original_title              release_type=release_type, -            
release_status=release_status, +            release_stage=release_stage,              release_date=release_date.isoformat(),              release_year=release_year, -            doi=doi, -            #pmid -            #pmcid -            #isbn13     # TODO: ? +            ext_ids=fatcat_client.ReleaseExtIds( +                doi=doi, +                jstor=jstor_id, +            ),              volume=volume,              issue=issue,              pages=pages,              publisher=publisher,              language=language, -            #license_slug   # TODO: ? +            #license_slug              # content, mimetype, lang              #abstracts=abstracts, -            # raw_name, role, raw_affiliation, extra              contribs=contribs,              # key, year, container_name, title, locator @@ -159,20 +209,62 @@ class JstorXmlParser():              #   name, type, publisher, issnl              #   extra: issnp, issne, original_name, languages, country -            container=container, - -            # extra: -            #   withdrawn_date -            #   translation_of -            #   subtitle -            #   aliases -            #   container_name -            #   group-title -            #   pubmed: retraction refs +            container_id=container_id, +              extra=extra,          )          return re +    def try_update(self, re): + +        # first, lookup existing by JSTOR id (which must be defined) +        existing = None +        try: +            existing = self.api.lookup_release(jstor=re.ext_ids.jstor) +        except fatcat_client.rest.ApiException as err: +            if err.status != 404: +                raise err + +        # then try DOI lookup if there is one +        if not existing and re.ext_ids.doi: +            try: +                existing = self.api.lookup_release(doi=re.ext_ids.doi) +            except fatcat_client.rest.ApiException as err: +                if err.status != 404: +                    raise 
err + +        if existing and existing.ext_ids.jstor: +            # don't update if it already has JSTOR ID +            self.counts['exists'] += 1 +            return False +        elif existing: +            # but do update if only DOI was set +            existing.ext_ids.jstor = re.ext_ids.jstor +            existing.extra['jstor'] = re.extra['jstor'] +            self.api.update_release(self.get_editgroup_id(), existing.ident, existing) +            self.counts['update'] += 1 +            return False + +        return True + +    def insert_batch(self, batch): +        self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch( +            editgroup=fatcat_client.Editgroup( +                description=self.editgroup_description, +                extra=self.editgroup_extra), +            entity_list=batch)) + +    def parse_file(self, handle): + +        # 1. open with beautiful soup +        soup = BeautifulSoup(handle, "xml") + +        # 2. iterate over articles, call parse_record on each +        for article in soup.find_all("article"): +            resp = self.parse_record(article) +            print(json.dumps(resp)) +            #sys.exit(-1) +  if __name__=='__main__': -    parser = JstorXmlParser() +    parser = JstorImporter()      parser.parse_file(open(sys.argv[1])) | 
