update journal meta import/transform

author: Bryan Newbold <bnewbold@robocracy.org> 2019-01-25 16:42:28 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-01-25 16:42:28 -0800
commit: 7d55f030cadbdc213e0773a52ad52ccfbfa07cad (patch)
tree: 8a19720c83521e81a865dfd1a5a7a6890652e49d /python/fatcat_tools/importers
parent: 2e6e5d9b270044d3462a95512a12520650cc45af (diff)
download: fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.tar.gz fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.zip
1 files changed, 39 insertions, 104 deletions
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index cf3971b5..7f6b1ee8 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -28,90 +28,9 @@ def truthy(s):
 class JournalMetadataImporter(EntityImporter):
     """
     Imports journal metadata ("containers") by ISSN, currently from a custom
-    (data munged) .csv file format
-
-    CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
-
-        ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
-
-
-    'extra' fields:
-
-        doaj
-            as_of: datetime of most recent check; if not set, not actually in DOAJ
-            seal: bool
-            work_level: bool (are work-level publications deposited with DOAJ?)
-            archiving: array, can include 'library' or 'other'
-        road
-            as_of: datetime of most recent check; if not set, not actually in ROAD
-        pubmed (TODO: delete?)
-            as_of: datetime of most recent check; if not set, not actually indexed in pubmed
-        norwegian (TODO: drop this?)
-            as_of: datetime of most recent check; if not set, not actually indexed in pubmed
-            id (integer)
-            level (integer; 0-2)
-        kbart
-            lockss
-                year_rle
-                volume_rle
-            portico
-                ...
-            clockss
-                ...
-        sherpa_romeo
-            color
-        jstor
-            year_rle
-            volume_rle
-        scopus
-            id
-            TODO: print/electronic distinction?
-        wos
-            id
-        doi
-            crossref_doi: DOI of the title in crossref (if exists)
-            prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref)
-        ia
-            sim
-                nap_id
-                year_rle
-                volume_rle
-            longtail: boolean
-            homepage
-                as_of: datetime of last attempt
-                url
-                status: HTTP/heritrix status of homepage crawl
-
-        issnp: string
-        issne: string
-        coden: string
-        abbrev: string
-        oclc_id: string (TODO: lookup?)
-        lccn_id: string (TODO: lookup?)
-        dblb_id: string
-        default_license: slug
-        original_name: native name (if name is translated)
-        platform: hosting platform: OJS, wordpress, scielo, etc
-        mimetypes: array of strings (eg, 'application/pdf', 'text/html')
-        first_year: year (integer)
-        last_year: if publishing has stopped
-        primary_language: single ISO code, or 'mixed'
-        languages: array of ISO codes
-        region: TODO: continent/world-region
-        nation: shortcode of nation
-        discipline: TODO: highest-level subject; "life science", "humanities", etc
-        field: TODO: narrower description of field
-        subjects: TODO?
-        url: homepage
-        is_oa: boolean. If true, can assume all releases under this container are "Open Access"
-        TODO: domains, if exclusive?
-        TODO: fulltext_regex, if a known pattern?
-
-    For KBART, etc:
-        We "over-count" on the assumption that "in-progress" status works will soon actually be preserved.
-        year and volume spans are run-length-encoded arrays, using integers:
-            - if an integer, means that year is preserved
-            - if an array of length 2, means everything between the two numbers (inclusive) is preserved
+    munged JSON format (see ../extra/journal_metadata/).
+
+    See guide for details on the many 'extra' fields used here.
     """
 
     def __init__(self, api, **kwargs):
@@ -125,34 +44,50 @@ class JournalMetadataImporter(EntityImporter):
             editgroup_extra=eg_extra)
 
     def want(self, raw_record):
-        if raw_record.get('ISSN-L'):
+        if raw_record.get('issnl'):
             return True
         return False
 
     def parse_record(self, row):
         """
-        row is a python dict (parsed from CSV).
+        row is a python dict (parsed from JSON).
+
         returns a ContainerEntity (or None if invalid or couldn't parse)
         """
-        title = or_none(row['title'])
-        issnl = or_none(row['ISSN-L'])
-        if title is None or issnl is None:
-            return None
-        extra = dict(
-            in_doaj=truthy(row['in_doaj']),
-            in_road=truthy(row['in_road']),
-            in_norwegian=truthy(row['in_norwegian']),
-            language=or_none(row['lang']),
-            url=or_none(row['url']),
-            ISSNp=or_none(row['ISSN-print']),
-            ISSNe=or_none(row['ISSN-electronic']),
-            is_oa=truthy(row['is_oa']),
-            is_kept=truthy(row['is_kept']),
-        )
+
+        extra = dict()
+        for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
+            'coden', 'aliases', 'original_name', 'first_year', 'last_year',
+            'platform', 'default_license', 'road', 'mimetypes',
+            'sherpa_romeo', 'kbart'):
+            if row.get(key):
+                extra[key] = row[key]
+        # TODO: not including for now: norwegian, dois/crossref, ia
+
+        extra_doaj = dict()
+        if row.get('doaj'):
+            if row['doaj'].get('as_of'):
+                extra_doaj['as_of'] = row['doaj']['as_of']
+            if row['doaj'].get('works'):
+                extra_doaj['works'] = row['doaj']['works']
+        if extra_doaj:
+            extra['doaj'] = extra_doaj
+
+        extra_ia = dict()
+        # TODO: would like an ia.longtail_ia flag
+        if row.get('sim'):
+            extra_ia['sim'] = {
+                'year_spans': row['sim']['year_spans'],
+            }
+        if extra_ia:
+            extra['ia'] = extra_ia
+
         ce = fatcat_client.ContainerEntity(
-            issnl=issnl,
-            name=clean(title),
-            publisher=or_none(clean(row['publisher'])),
+            issnl=row['issnl'],
+            container_type=None, # TODO
+            name=clean(row.get('name')),
+            publisher=clean(row.get('publisher')),
+            wikidata_qid=None, # TODO
             extra=extra)
         return ce
author	Bryan Newbold <bnewbold@robocracy.org>	2019-01-25 16:42:28 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-01-25 16:42:28 -0800
commit	7d55f030cadbdc213e0773a52ad52ccfbfa07cad (patch)
tree	8a19720c83521e81a865dfd1a5a7a6890652e49d /python/fatcat_tools/importers
parent	2e6e5d9b270044d3462a95512a12520650cc45af (diff)
download	fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.tar.gz fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.zip