diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-25 16:42:28 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-25 16:42:28 -0800 | 
| commit | 7d55f030cadbdc213e0773a52ad52ccfbfa07cad (patch) | |
| tree | 8a19720c83521e81a865dfd1a5a7a6890652e49d /python/fatcat_tools/importers | |
| parent | 2e6e5d9b270044d3462a95512a12520650cc45af (diff) | |
| download | fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.tar.gz fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.zip | |
update journal meta import/transform
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/journal_metadata.py | 143 | 
1 file changed, 39 insertions, 104 deletions
| diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index cf3971b5..7f6b1ee8 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -28,90 +28,9 @@ def truthy(s):  class JournalMetadataImporter(EntityImporter):      """      Imports journal metadata ("containers") by ISSN, currently from a custom -    (data munged) .csv file format - -    CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): - -        ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count - - -    'extra' fields: - -        doaj -            as_of: datetime of most recent check; if not set, not actually in DOAJ -            seal: bool -            work_level: bool (are work-level publications deposited with DOAJ?) -            archiving: array, can include 'library' or 'other' -        road -            as_of: datetime of most recent check; if not set, not actually in ROAD -        pubmed (TODO: delete?) -            as_of: datetime of most recent check; if not set, not actually indexed in pubmed -        norwegian (TODO: drop this?) -            as_of: datetime of most recent check; if not set, not actually indexed in pubmed -            id (integer) -            level (integer; 0-2) -        kbart -            lockss -                year_rle -                volume_rle -            portico -                ... -            clockss -                ... -        sherpa_romeo -            color -        jstor -            year_rle -            volume_rle -        scopus -            id -            TODO: print/electronic distinction? 
-        wos -            id -        doi -            crossref_doi: DOI of the title in crossref (if exists) -            prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref) -        ia -            sim -                nap_id -                year_rle -                volume_rle -            longtail: boolean -            homepage -                as_of: datetime of last attempt -                url -                status: HTTP/heritrix status of homepage crawl - -        issnp: string -        issne: string -        coden: string -        abbrev: string -        oclc_id: string (TODO: lookup?) -        lccn_id: string (TODO: lookup?) -        dblb_id: string -        default_license: slug -        original_name: native name (if name is translated) -        platform: hosting platform: OJS, wordpress, scielo, etc -        mimetypes: array of strings (eg, 'application/pdf', 'text/html') -        first_year: year (integer) -        last_year: if publishing has stopped -        primary_language: single ISO code, or 'mixed' -        languages: array of ISO codes -        region: TODO: continent/world-region -        nation: shortcode of nation -        discipline: TODO: highest-level subject; "life science", "humanities", etc -        field: TODO: narrower description of field -        subjects: TODO? -        url: homepage -        is_oa: boolean. If true, can assume all releases under this container are "Open Access" -        TODO: domains, if exclusive? -        TODO: fulltext_regex, if a known pattern? - -    For KBART, etc: -        We "over-count" on the assumption that "in-progress" status works will soon actually be preserved. 
-        year and volume spans are run-length-encoded arrays, using integers: -            - if an integer, means that year is preserved -            - if an array of length 2, means everything between the two numbers (inclusive) is preserved +    munged JSON format (see ../extra/journal_metadata/). + +    See guide for details on the many 'extra' fields used here.      """      def __init__(self, api, **kwargs): @@ -125,34 +44,50 @@ class JournalMetadataImporter(EntityImporter):              editgroup_extra=eg_extra)      def want(self, raw_record): -        if raw_record.get('ISSN-L'): +        if raw_record.get('issnl'):              return True          return False      def parse_record(self, row):          """ -        row is a python dict (parsed from CSV). +        row is a python dict (parsed from JSON). +          returns a ContainerEntity (or None if invalid or couldn't parse)          """ -        title = or_none(row['title']) -        issnl = or_none(row['ISSN-L']) -        if title is None or issnl is None: -            return None -        extra = dict( -            in_doaj=truthy(row['in_doaj']), -            in_road=truthy(row['in_road']), -            in_norwegian=truthy(row['in_norwegian']), -            language=or_none(row['lang']), -            url=or_none(row['url']), -            ISSNp=or_none(row['ISSN-print']), -            ISSNe=or_none(row['ISSN-electronic']), -            is_oa=truthy(row['is_oa']), -            is_kept=truthy(row['is_kept']), -        ) + +        extra = dict() +        for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev', +            'coden', 'aliases', 'original_name', 'first_year', 'last_year', +            'platform', 'default_license', 'road', 'mimetypes', +            'sherpa_romeo', 'kbart'): +            if row.get(key): +                extra[key] = row[key] +        # TODO: not including for now: norwegian, dois/crossref, ia + +        extra_doaj = dict() +        if row.get('doaj'): +     
       if row['doaj'].get('as_of'): +                extra_doaj['as_of'] = row['doaj']['as_of'] +            if row['doaj'].get('works'): +                extra_doaj['works'] = row['doaj']['works'] +        if extra_doaj: +            extra['doaj'] = extra_doaj + +        extra_ia = dict() +        # TODO: would like an ia.longtail_ia flag +        if row.get('sim'): +            extra_ia['sim'] = { +                'year_spans': row['sim']['year_spans'], +            } +        if extra_ia: +            extra['ia'] = extra_ia +          ce = fatcat_client.ContainerEntity( -            issnl=issnl, -            name=clean(title), -            publisher=or_none(clean(row['publisher'])), +            issnl=row['issnl'], +            container_type=None, # TODO +            name=clean(row.get('name')), +            publisher=clean(row.get('publisher')), +            wikidata_qid=None, # TODO              extra=extra)          return ce | 
