diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-22 12:21:07 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-22 12:21:07 -0700 | 
| commit | 77abede313c97eefc5cef16dabc3213df7000b16 (patch) | |
| tree | e7218a38f72285af1953fdd23e39c467f5464b2c | |
| parent | d33c8cf05e3c9732b04f56cf356180b9d76e04e0 (diff) | |
| download | fatcat-77abede313c97eefc5cef16dabc3213df7000b16.tar.gz fatcat-77abede313c97eefc5cef16dabc3213df7000b16.zip | |
JSTOR importer polish
| -rw-r--r-- | python/fatcat_tools/importers/jstor.py | 69 | 
1 file changed, 51 insertions, 18 deletions
| diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index c8a7b20a..c846cbde 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -7,6 +7,7 @@ from bs4 import BeautifulSoup  import fatcat_client  from .common import EntityImporter, clean, LANG_MAP_MARC +from .crossref import CONTAINER_TYPE_MAP  # TODO: more entries?  JSTOR_CONTRIB_MAP = { @@ -16,6 +17,14 @@ JSTOR_CONTRIB_MAP = {      'illustrator': 'illustrator',  } +JSTOR_TYPE_MAP = { +    "book-review": "review-book", +    "editorial": "editorial", +    "misc": "stub", +    "news": "article", +    "research-article": "article-journal", +} +  class JstorImporter(EntityImporter):      """      Importer for JSTOR bulk XML metadata (eg, from their Early Journals @@ -38,6 +47,9 @@ class JstorImporter(EntityImporter):          self.read_issn_map_file(issn_map_file) +    def map_container_type(self, crossref_type): +        return CONTAINER_TYPE_MAP.get(crossref_type) +      def want(self, obj):          return True @@ -49,13 +61,22 @@ class JstorImporter(EntityImporter):          extra = dict()          extra_jstor = dict() +        release_type = JSTOR_TYPE_MAP.get(article['article-type'])          title = article_meta.find("article-title") -        if title: +        if title and title.string:              title = title.string.strip() -            if title.endswith('.'): -                title = title[:-1] +        elif title and not title.string: +            title = None + +        if not title and release_type.startswith('review') and article_meta.product.source: +            title = "Review: {}".format(article_meta.product.source.string) + +        if not title: +            return None + +        if title.endswith('.'): +            title = title[:-1] -        release_type = "article-journal"          if "[Abstract]" in title:              # TODO: strip the "[Abstract]" bit?              
release_type = "abstract" @@ -108,28 +129,40 @@ class JstorImporter(EntityImporter):          jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})          if jstor_id: -            jstor_id = jstor_id.string +            jstor_id = jstor_id.string.strip() +        if not jstor_id and doi: +            assert doi.startswith('10.2307/') +            jstor_id = doi.replace('10.2307/', '') +        assert jstor_id and int(jstor_id)          contribs = []          cgroup = article_meta.find("contrib-group")          if cgroup:              for c in cgroup.find_all("contrib"):                  given = c.find("given-names") +                if given: +                    given = clean(given.string)                  surname = c.find("surname") -                if given and surname: -                    name = "{} {}".format(given.string, surname.string) -                elif surname: -                    name = surname.string -                else: -                    name = None -                role = JSTOR_CONTRIB_MAP.get(c['contrib-type']) -                if not role and c['contrib-type']: +                if surname: +                    surname = clean(surname.string) +                raw_name = c.find("string-name") +                if raw_name: +                    raw_name = clean(raw_name.string) + +                if not raw_name: +                    if given and surname: +                        raw_name = "{} {}".format(given, surname) +                    elif surname: +                        raw_name = surname + +                role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author')) +                if not role and c.get('contrib-type'):                      sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c['contrib-type']))                  contribs.append(fatcat_client.ReleaseContrib( -                    role=JSTOR_CONTRIB_MAP.get(c['contrib-type']), -                    raw_name=clean(name), -                    
given_name=clean(given.string), -                    surname=clean(surname.string), +                    role=role, +                    raw_name=raw_name, +                    given_name=given, +                    surname=surname,                  ))          release_year = None @@ -164,7 +197,7 @@ class JstorImporter(EntityImporter):          language = None          cm = article_meta.find("custom-meta")          if cm.find("meta-name").string == "lang": -            language = cm.find("meta-value").string +            language = cm.find("meta-value").string.split()[0]              language = LANG_MAP_MARC.get(language)              if not language:                  warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) | 
