diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-23 17:35:36 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-23 17:35:36 -0800 | 
| commit | 1536109fff643767b6e8f8515fe0eb4d8cae5854 (patch) | |
| tree | 59bf1aa1ebb4f9cabdb4fb2e7130c302973d5ec0 /python/fatcat_tools/importers | |
| parent | a75717bc876c0888064fbc9a3bf69d1954a7c0cc (diff) | |
| download | fatcat-1536109fff643767b6e8f8515fe0eb4d8cae5854.tar.gz fatcat-1536109fff643767b6e8f8515fe0eb4d8cae5854.zip | |
bunch of crossref import tweaks (need tests)
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/crossref.py | 93 | 
1 file changed, 43 insertions, 50 deletions
| diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index cbb6deb5..75132901 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -114,25 +114,10 @@ class CrossrefImporter(EntityImporter):          return CONTAINER_TYPE_MAP.get(release_type)      def want(self, obj): - -        # Ways to be out of scope (provisionally) -        # journal-issue and journal-volume map to None, but allowed for now -        if obj.get('type') in (None, 'journal', 'proceedings', -                'standard-series', 'report-series', 'book-series', 'book-set', -                'book-track', 'proceedings-series'): -            return False - -        # Do require the 'title' keys to exsit, as release entities do -        if (not 'title' in obj) or (not obj['title']): -            return False - -        # Can't handle such large lists yet -        authors = len(obj.get('author', [])) -        abstracts = len(obj.get('abstract', [])) -        refs = len(obj.get('reference', [])) -        if max(authors, abstracts, refs) > 750: +        if not obj.get('title'):              return False +        # do most of these checks in-line below          return True      def parse_record(self, obj): @@ -141,6 +126,17 @@ class CrossrefImporter(EntityImporter):          returns a ReleaseEntity          """ +        # Ways to be out of scope (provisionally) +        # journal-issue and journal-volume map to None, but allowed for now +        if obj.get('type') in (None, 'journal', 'proceedings', +                'standard-series', 'report-series', 'book-series', 'book-set', +                'book-track', 'proceedings-series'): +            return None + +        # Do require the 'title' keys to exsit, as release entities do +        if (not 'title' in obj) or (not obj['title']): +            return None +          release_type = self.map_release_type(obj['type'])          # contribs @@ -168,10 +164,10 @@ 
class CrossrefImporter(EntityImporter):                      if len(am.get('affiliation')) > 0:                          raw_affiliation = am.get('affiliation')[0]['name']                      if len(am.get('affiliation')) > 1: -                        # note: affiliation => affiliations -                        extra['affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] +                        # note: affiliation => more_affiliations +                        extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]                  if am.get('sequence') and am.get('sequence') != "additional": -                    extra['sequence'] = am.get('sequence') +                    extra['seq'] = clean(am.get('sequence'))                  if not extra:                      extra = None                  assert ctype in ("author", "editor", "translator") @@ -207,28 +203,28 @@ class CrossrefImporter(EntityImporter):          # license slug          license_slug = None +        license_extra = []          for l in obj.get('license', []):              if l['content-version'] not in ('vor', 'unspecified'):                  continue              slug = LICENSE_SLUG_MAP.get(l['URL'])              if slug:                  license_slug = slug -                break +            if 'start' in l: +                l['start'] = l['start']['date-time'] +            license_extra.append(l)          # references          refs = []          for i, rm in enumerate(obj.get('reference', [])):              try:                  year = int(rm.get('year')) -                # NOTE: will need to update/config in the future! +                # TODO: will need to update/config in the future!                  # NOTE: are there crossref works with year < 100?                  
if year > 2025 or year < 100:                      year = None              except:                  year = None -            extra = rm.copy() -            if rm.get('DOI'): -                extra['doi'] = rm.get('DOI').lower()              key = rm.get('key')              if key and key.startswith(obj['DOI'].upper()):                  key = key.replace(obj['DOI'].upper() + "-", '') @@ -236,18 +232,18 @@ class CrossrefImporter(EntityImporter):              container_name = rm.get('volume-title')              if not container_name:                  container_name = rm.get('journal-title') -            ref_locator = rm.get('first-page') -            ref_title = rm.get('title') -            if extra.get('DOI'): -                extra['doi'] = extra['DOI'] -            extra.pop('DOI', None) -            extra.pop('key', None) -            extra.pop('year', None) -            extra.pop('volume-title', None) -            extra.pop('journal-title', None) -            extra.pop('title', None) -            extra.pop('first-page', None) -            extra.pop('doi-asserted-by', None) +            elif rm.get('journal-title'): +                extra['journal-title'] = rm['journal-title'] +            extra = dict() +            if rm.get('DOI'): +                extra['doi'] = rm.get('DOI').lower() +            # TODO: what fields here? 
CSL citation stuff +            for k in ('authors', 'editor', 'edition', 'authority', 'version', +                    'genre', 'url', 'event', 'issue', 'volume', 'date', +                    'accessed_date', 'issued', 'page', 'medium', +                    'collection_title', 'chapter_number'): +                if clean(rm.get(k)): +                    extra[k] = clean(rm[k])              if extra:                  extra = dict(crossref=extra)              else: @@ -259,8 +255,8 @@ class CrossrefImporter(EntityImporter):                  key=key,                  year=clean(year),                  container_name=clean(container_name), -                title=clean(ref_title), -                locator=clean(ref_locator), +                title=clean(rm.get('title')), +                locator=clean(rm.get('first-page')),                  # TODO: just dump JSON somewhere here?                  extra=extra)) @@ -273,24 +269,20 @@ class CrossrefImporter(EntityImporter):          # extra fields          extra = dict() -        for key in ('subject', 'type', 'license', 'alternative-id', -                'container-title', 'original-title', 'subtitle', 'archive', -                'funder', 'group-title'): -            # TODO: unpack "container-title" array +        for key in ('subject', 'type', 'alternative-id', 'container-title', +                'subtitle', 'archive', 'funder', 'group-title'): +            # TODO: unpack "container-title" array?              
val = obj.get(key)              if val:                  if type(val) == str:                      extra[key] = clean(val)                  else:                      extra[key] = val -        if 'license' in extra and extra['license']: -            for i in range(len(extra['license'])): -                if 'start' in extra['license'][i]: -                    extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] +        if license_extra: +            extra['license'] = license_extra +          if len(obj['title']) > 1:              extra['other-titles'] = [clean(t) for t in obj['title'][1:]] -        # TODO: this should be top-level -        extra['is_kept'] = len(obj.get('archive', [])) > 0          # ISBN          isbn13 = None @@ -313,7 +305,8 @@ class CrossrefImporter(EntityImporter):          # TODO: filter out huge releases; we'll get them later (and fix bug in          # fatcatd) -        assert max(len(contribs), len(refs), len(abstracts)) <= 750 +        if max(len(contribs), len(refs), len(abstracts)) > 750: +            return None          # release date parsing is amazingly complex          raw_date = obj['issued']['date-parts'][0] | 
