Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/doaj_article.py | 182
1 file changed, 125 insertions, 57 deletions
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 74ac9a0e..c0e75283 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -4,17 +4,15 @@ Importer for DOAJ article-level metadata, schema v1.
 
 DOAJ API schema and docs: https://doaj.org/api/v1/docs
 """
-import collections
+import warnings
 import datetime
-import sys
-from typing import List, Dict, Optional
-
-import langdetect
+from typing import List, Optional
 
 import fatcat_openapi_client
-from fatcat_tools.normal import clean_doi
-from fatcat_tools.transforms import entity_to_dict
-from fatcat_tools.importers.common import EntityImporter, clean
+from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
+    clean_orcid, detect_text_lang, parse_lang_name, parse_country_name,
+    clean_pmid, clean_pmcid)
+from fatcat_tools.importers.common import EntityImporter
 
 # Cutoff length for abstracts.
 MAX_ABSTRACT_LENGTH = 2048
@@ -48,7 +46,6 @@ class DoajArticleImporter(EntityImporter):
 
     def want(self, obj):
         return True
-
    def parse_record(self, obj):
         """
         bibjson {
@@ -74,14 +71,6 @@ class DoajArticleImporter(EntityImporter):
             title (string, optional),
             volume (string, optional)
         }
-
-        TODO:
-        - release_date
-        - container_id
-        - issue (number?)
-        - license is article license; import as slug
-        - "open_access" flag in doaj_meta
-        - container lookup from issns ("issns" key)
         """
 
         if not obj or not isinstance(obj, dict) or not 'bibjson' in obj:
@@ -90,42 +79,51 @@ class DoajArticleImporter(EntityImporter):
 
         bibjson = obj['bibjson']
 
-        title = clean(bibjson.get('title'))
+        title = clean_str(bibjson.get('title'), force_xml=True)
         if not title:
             self.counts['skip-title'] += 1
             return False
 
+        container_name = clean_str(bibjson['journal']['title'])
         container_id = None
-        container_name = None
-
-        volume = clean(bibjson['journal'].get('volume'))
-        number = clean(bibjson['journal'].get('number'))
-        publisher = clean(bibjson['journal'].get('publisher'))
+        # NOTE: 'issns' not documented in API schema
+        for issn in bibjson['journal']['issns']:
+            issnl = self.issn2issnl(issn)
+            if issnl:
+                container_id = self.lookup_issnl(self.issn2issnl(issn))
+            if container_id:
+                # don't store container_name when we have an exact match
+                container_name = None
+                break
+
+        volume = clean_str(bibjson['journal'].get('volume'))
+        # NOTE: this schema seems to use "number" as "issue number"
+        issue = clean_str(bibjson['journal'].get('number'))
+        publisher = clean_str(bibjson['journal'].get('publisher'))
 
         try:
             release_year = int(bibjson.get('year'))
         except (TypeError, ValueError):
             release_year = None
-        # XXX: parse_month
-        release_month = clean(bibjson.get('year'))
+        release_month = parse_month(clean_str(bibjson.get('month')))
 
         # block bogus far-future years/dates
         if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
             release_month = None
             release_year = None
 
-        # country
-        country = None
-        # XXX: country = parse_country(bibjson['journal'].get('country'))
-
-        # language
+        license_slug = self.doaj_license_slug(bibjson['journal'].get('license'))
+        country = parse_country_name(bibjson['journal'].get('country'))
         language = None
-        # XXX: language = parse_language(bibjson['journal'].get('language'))
+        for raw in bibjson['journal'].get('language') or []:
+            language = parse_lang_name(raw)
+            if language:
+                break
 
         # pages
-        # TODO: error in API docs? seems like start_page not under 'journal' object
-        start_page = clean(bibjson['journal'].get('start_page')) or clean(bibjson.get('start_page'))
-        end_page = clean(bibjson['journal'].get('end_page')) or clean(bibjson.get('end_page'))
+        # NOTE: error in API docs? seems like start_page not under 'journal' object
+        start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page'))
+        end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page'))
         pages: Optional[str] = None
         if start_page and end_page:
             pages = f"{start_page}-{end_page}"
@@ -136,13 +134,13 @@ class DoajArticleImporter(EntityImporter):
         ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id)
         abstracts = self.doaj_abstracts(bibjson)
         contribs = self.doaj_contribs(bibjson.get('author') or [])
-            
+
         # DOAJ-specific extra
         doaj_extra = dict()
         if bibjson.get('subject'):
             doaj_extra['subject'] = bibjson.get('subject')
         if bibjson.get('keywords'):
-            doaj_extra['keywords'] = [k for k in [clean(s) for s in bibjson.get('keywords')] if k]
+            doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k]
 
         # generic extra
         extra = dict()
@@ -171,13 +169,12 @@ class DoajArticleImporter(EntityImporter):
             ext_ids=ext_ids,
             contribs=contribs,
             volume=volume,
-            number=number, # XXX
-            #issue,
+            issue=issue,
             pages=pages,
             language=language,
             abstracts=abstracts,
             extra=extra,
-            #license_slug=license_slug,
+            license_slug=license_slug,
         )
         re = self.biblio_hacks(re)
         return re
@@ -192,7 +189,7 @@ class DoajArticleImporter(EntityImporter):
 
     def try_update(self, re):
 
-        # lookup existing DOI (don't need to try other ext idents for crossref)
+        # lookup existing release by DOAJ article id
         existing = None
         try:
             existing = self.api.lookup_release(doaj=re.ext_ids.doaj)
@@ -202,13 +199,62 @@ class DoajArticleImporter(EntityImporter):
             # doesn't exist, need to update
             return True
 
-        # eventually we'll want to support "updates", but for now just skip if
-        # entity already exists
-        if existing:
+        # then try other ext_id lookups
+        if not existing:
+            for extid_type in ('doi', 'pmid', 'pmcid'):
+                extid_val = re.ext_ids.__dict__[extid_type]
+                if not extid_val:
+                    continue
+                try:
+                    existing = self.api.lookup_release(**{extid_type: extid_val})
+                except fatcat_openapi_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+                if existing:
+                    if existing.ext_ids.doaj:
+                        warn_str = f"unexpected DOAJ ext_id match after lookup failed doaj={re.ext_ids.doaj} ident={existing.ident}"
+                        warnings.warn(warn_str)
+                        self.counts["skip-doaj-id-mismatch"] += 1
+                        return None
+                    break
+
+        # TODO: in the future could do fuzzy match here, eg using elasticsearch
+
+        # create entity
+        if not existing:
+            return True
+
+        # other logic could go here about skipping updates
+        if not self.do_updates or existing.ext_ids.doaj:
             self.counts['exists'] += 1
             return False
 
-        return True
+        # fields to copy over for update
+        existing.ext_ids.doaj = existing.ext_ids.doaj or re.ext_ids.doaj
+        existing.release_type = existing.release_type or re.release_type
+        existing.release_stage = existing.release_stage or re.release_stage
+        existing.container_id = existing.container_id or re.container_id
+        existing.abstracts = existing.abstracts or re.abstracts
+        existing.extra['doaj'] = re.extra['doaj']
+        existing.volume = existing.volume or re.volume
+        existing.issue = existing.issue or re.issue
+        existing.pages = existing.pages or re.pages
+        existing.language = existing.language or re.language
+
+        try:
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+        except fatcat_openapi_client.rest.ApiException as err:
+            # there is a code path where we try to update the same release
+            # twice in a row; if that happens, just skip
+            # NOTE: API behavior might change in the future?
+            if "release_edit_editgroup_id_ident_id_key" in err.body:
+                self.counts['skip-update-conflict'] += 1
+                return False
+            else:
+                raise err
+
+        return False
 
     def insert_batch(self, batch):
         self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
@@ -218,19 +264,13 @@ class DoajArticleImporter(EntityImporter):
             entity_list=batch))
 
     def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]:
-        text = clean(bibjson['abstract'])
+        text = clean_str(bibjson.get('abstract'))
         if not text or len(text) < 10:
             return []
         if len(text) > MAX_ABSTRACT_LENGTH:
             text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
 
-        # Detect language. This is fuzzy and may be removed, if too unreliable.
-        lang = None
-        try:
-            lang = langdetect.detect(text)
-        except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err:
-            #print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr)
-            pass
+        lang = detect_text_lang(text)
 
         abstract = fatcat_openapi_client.ReleaseAbstract(
             mimetype="text/plain",
@@ -249,15 +289,22 @@ class DoajArticleImporter(EntityImporter):
         }
         """
        contribs = []
-        # TODO: index?
+        index = 0
         for author in authors:
             if not author.get('name'):
                 continue
+            creator_id = None
+            orcid = clean_orcid(author.get('orcid_id'))
+            if orcid:
+                creator_id = self.lookup_orcid(orcid)
             contribs.append(fatcat_openapi_client.ReleaseContrib(
                 raw_name=author.get('name'),
-                # XXX: orcid_id=author.get('orcid_id') or None,
-                # XXX: affiliation=author.get('affiliation') or None,
+                role='author',
+                index=index,
+                creator_id=creator_id,
+                raw_affiliation=clean_str(author.get('affiliation')),
             ))
+            index += 1
         return contribs
 
     def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds:
@@ -277,9 +324,9 @@ class DoajArticleImporter(EntityImporter):
             if id_obj['type'].lower() == 'doi':
                 doi = clean_doi(id_obj['id'])
             elif id_obj['type'].lower() == 'pmid':
-                pmid = id_obj['id']
+                pmid = clean_pmid(id_obj['id'])
             elif id_obj['type'].lower() == 'pmcid':
-                pmcid = id_obj['id']
+                pmcid = clean_pmcid(id_obj['id'])
 
         return fatcat_openapi_client.ReleaseExtIds(
             doaj=doaj_article_id,
@@ -287,3 +334,24 @@ class DoajArticleImporter(EntityImporter):
             pmid=pmid,
             pmcid=pmcid,
         )
+
+    def doaj_license_slug(self, license_list: List[dict]) -> Optional[str]:
+        """
+        bibjson.journal.license {
+            open_access (boolean, optional),
+            title (string, optional),
+            type (string, optional),
+            url (string, optional),
+            version (string, optional)
+        }
+        """
+        if not license_list:
+            return None
+        for license in license_list:
+            if not license.get('open_access'):
+                continue
+            slug = license.get('type')
+            if slug and slug.startswith('CC '):
+                slug = slug.replace('CC ', 'cc-').lower()
+                return slug
+        return None
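
In parse_record() above, container lookup chains each journal ISSN through issn2issnl() and lookup_issnl(), takes the first hit, and drops container_name on an exact match. A minimal sketch of that resolution chain, with plain dicts standing in for the importer's ISSN-L table and API call (toy data, hypothetical identifiers):

    from typing import List, Optional

    # toy stand-ins for the importer's issn2issnl() table and lookup_issnl() call
    ISSN_TO_ISSNL = {"2167-8359": "2167-8359"}
    ISSNL_TO_CONTAINER_ID = {"2167-8359": "containerdummy00"}

    def find_container_id(issns: List[str]) -> Optional[str]:
        # the first ISSN that normalizes to a known ISSN-L and resolves wins
        for issn in issns:
            issnl = ISSN_TO_ISSNL.get(issn)
            if not issnl:
                continue
            container_id = ISSNL_TO_CONTAINER_ID.get(issnl)
            if container_id:
                return container_id
        return None

    assert find_container_id(["0000-0000", "2167-8359"]) == "containerdummy00"
    assert find_container_id(["0000-0000"]) is None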
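
try_update() above first looks up the release by DOAJ article id, then falls back to DOI, PMID, and PMCID; a fallback hit that already carries a DOAJ id is counted as a mismatch and skipped. A hedged sketch of that resolution order, with a stub lookup() in place of the fatcat API client (toy records, hypothetical idents):

    from typing import Optional

    # toy records standing in for releases already in the catalog
    RECORDS = [
        {"ident": "aaaaaaaaaaaa", "doaj": None, "doi": "10.1234/x", "pmid": None, "pmcid": None},
    ]

    def lookup(**kwargs) -> Optional[dict]:
        # stub for api.lookup_release(); matches on a single ext_id
        key, val = next(iter(kwargs.items()))
        return next((r for r in RECORDS if r[key] == val), None)

    def resolve(ext_ids: dict):
        # returns ("update", record), ("skip", record), or ("create", None)
        existing = lookup(doaj=ext_ids["doaj"])
        if existing:
            return ("update", existing)
        for extid_type in ("doi", "pmid", "pmcid"):
            if not ext_ids[extid_type]:
                continue
            existing = lookup(**{extid_type: ext_ids[extid_type]})
            if existing:
                if existing["doaj"]:
                    # like counts["skip-doaj-id-mismatch"] in the diff
                    return ("skip", existing)
                return ("update", existing)
        return ("create", None)

    action, hit = resolve({"doaj": "doajdummyid", "doi": "10.1234/x", "pmid": None, "pmcid": None})
    assert action == "update" and hit["ident"] == "aaaaaaaaaaaa"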
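
When an update does proceed, only empty fields on the existing release are filled in (the existing.x = existing.x or re.x lines above); existing metadata always wins over the import. A standalone sketch of that copy-forward merge, where Release is a stand-in dataclass rather than the fatcat_openapi_client model:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Release:  # stand-in, not the fatcat_openapi_client model
        volume: Optional[str] = None
        issue: Optional[str] = None
        pages: Optional[str] = None

    def copy_forward(existing: Release, incoming: Release) -> Release:
        # `or` keeps a truthy existing value; imported metadata never
        # overwrites fields that are already set
        existing.volume = existing.volume or incoming.volume
        existing.issue = existing.issue or incoming.issue
        existing.pages = existing.pages or incoming.pages
        return existing

    merged = copy_forward(Release(volume="12"), Release(volume="99", issue="3"))
    assert merged.volume == "12"  # existing value kept
    assert merged.issue == "3"    # empty field filled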

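The new doaj_license_slug() helper only slugifies Creative Commons "type" strings on licenses flagged open_access, so "CC BY-NC-ND" becomes "cc-by-nc-nd" and anything else is dropped. A minimal restatement of that normalization, runnable on its own:

    from typing import List, Optional

    def license_slug(license_list: Optional[List[dict]]) -> Optional[str]:
        # mirrors doaj_license_slug() above: only open-access licenses count,
        # and only "CC ..." type strings map to a slug
        for lic in license_list or []:
            if not lic.get('open_access'):
                continue
            slug = lic.get('type')
            if slug and slug.startswith('CC '):
                return slug.replace('CC ', 'cc-').lower()
        return None

    assert license_slug([{'open_access': True, 'type': 'CC BY-NC-ND'}]) == 'cc-by-nc-nd'
    assert license_slug([{'open_access': False, 'type': 'CC BY'}]) is None
    assert license_slug(None) is None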