Diffstat (limited to 'python/fatcat_tools/importers/journal_metadata.py')
-rw-r--r--  python/fatcat_tools/importers/journal_metadata.py  183
1 file changed, 183 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
new file mode 100644
index 00000000..cf3971b5
--- /dev/null
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -0,0 +1,183 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from .common import EntityImporter, clean
+
+
+def or_none(s):
+    if s is None:
+        return None
+    if len(s) == 0:
+        return None
+    return s
+
+def truthy(s):
+    if s is None:
+        return None
+    s = s.lower()
+
+    if s in ('true', 't', 'yes', 'y', '1'):
+        return True
+    elif s in ('false', 'f', 'no', 'n', '0'):
+        return False
+    else:
+        return None
+
+class JournalMetadataImporter(EntityImporter):
+    """
+    Imports journal metadata ("containers") by ISSN, currently from a custom
+    (data munged) .csv file format
+
+    CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
+
+        ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+
+
+    'extra' fields:
+
+        doaj
+            as_of: datetime of most recent check; if not set, not actually in DOAJ
+            seal: bool
+            work_level: bool (are work-level publications deposited with DOAJ?)
+            archiving: array, can include 'library' or 'other'
+        road
+            as_of: datetime of most recent check; if not set, not actually in ROAD
+        pubmed (TODO: delete?)
+            as_of: datetime of most recent check; if not set, not actually indexed in pubmed
+        norwegian (TODO: drop this?)
+            as_of: datetime of most recent check; if not set, not actually indexed in the Norwegian registry
+            id (integer)
+            level (integer; 0-2)
+        kbart
+            lockss
+                year_rle
+                volume_rle
+            portico
+                ...
+            clockss
+                ...
+        sherpa_romeo
+            color
+        jstor
+            year_rle
+            volume_rle
+        scopus
+            id
+            TODO: print/electronic distinction?
+        wos
+            id
+        doi
+            crossref_doi: DOI of the title in crossref (if exists)
+            prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref)
+        ia
+            sim
+                nap_id
+                year_rle
+                volume_rle
+            longtail: boolean
+            homepage
+                as_of: datetime of last attempt
+                url
+                status: HTTP/heritrix status of homepage crawl
+        issnp: string
+        issne: string
+        coden: string
+        abbrev: string
+        oclc_id: string (TODO: lookup?)
+        lccn_id: string (TODO: lookup?)
+        dblp_id: string
+        default_license: slug
+        original_name: native name (if name is translated)
+        platform: hosting platform: OJS, wordpress, scielo, etc
+        mimetypes: array of strings (eg, 'application/pdf', 'text/html')
+        first_year: year (integer)
+        last_year: if publishing has stopped
+        primary_language: single ISO code, or 'mixed'
+        languages: array of ISO codes
+        region: TODO: continent/world-region
+        nation: shortcode of nation
+        discipline: TODO: highest-level subject; "life science", "humanities", etc
+        field: TODO: narrower description of field
+        subjects: TODO?
+        url: homepage
+        is_oa: boolean. If true, can assume all releases under this container are "Open Access"
+        TODO: domains, if exclusive?
+        TODO: fulltext_regex, if a known pattern?
+
+    For KBART, etc:
+    We "over-count" on the assumption that "in-progress" status works will
+    soon actually be preserved.
+    year and volume spans are run-length-encoded arrays, using integers:
+    - if an integer, means that year is preserved
+    - if an array of length 2, means everything between the two numbers
+      (inclusive) is preserved
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+
+    def want(self, raw_record):
+        if raw_record.get('ISSN-L'):
+            return True
+        return False
+
+    def parse_record(self, row):
+        """
+        row is a python dict (parsed from CSV).
+
+        returns a ContainerEntity (or None if invalid or couldn't parse)
+        """
+        title = or_none(row['title'])
+        issnl = or_none(row['ISSN-L'])
+        if title is None or issnl is None:
+            return None
+        extra = dict(
+            in_doaj=truthy(row['in_doaj']),
+            in_road=truthy(row['in_road']),
+            in_norwegian=truthy(row['in_norwegian']),
+            language=or_none(row['lang']),
+            url=or_none(row['url']),
+            ISSNp=or_none(row['ISSN-print']),
+            ISSNe=or_none(row['ISSN-electronic']),
+            is_oa=truthy(row['is_oa']),
+            is_kept=truthy(row['is_kept']),
+        )
+        ce = fatcat_client.ContainerEntity(
+            issnl=issnl,
+            name=clean(title),
+            publisher=or_none(clean(row['publisher'])),
+            extra=extra)
+        return ce
+
+    def try_update(self, ce):
+
+        existing = None
+        try:
+            existing = self.api.lookup_container(issnl=ce.issnl)
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+            # container doesn't exist, so we want to insert it
+            return True
+
+        # eventually we'll want to support "updates", but for now just skip if
+        # entity already exists
+        if existing:
+            self.counts['exists'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
+        self.api.create_container_batch(batch,
+            autoaccept=True,
+            description=self.editgroup_description,
+            extra=json.dumps(self.editgroup_extra))
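To make the run-length-encoded year/volume spans described in the docstring concrete, here is a minimal decoding sketch. The helper name rle_to_years is hypothetical and not part of the patch above; it assumes a span list mixes bare integers with inclusive two-element ranges, exactly as the docstring describes:

def rle_to_years(spans):
    # Expand an RLE span list like [1998, [2001, 2004]] into sorted years.
    years = set()
    for span in spans:
        if isinstance(span, int):
            # a bare integer means that single year is preserved
            years.add(span)
        else:
            # a two-element array covers the whole inclusive range
            start, end = span
            years.update(range(start, end + 1))
    return sorted(years)

assert rle_to_years([1998, [2001, 2004]]) == [1998, 2001, 2002, 2003, 2004]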
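A purely illustrative example of a populated 'extra' dict following the schema sketched in the docstring (all values invented, and only a handful of the possible fields shown):

example_extra = {
    "doaj": {"as_of": "2019-01-01", "seal": False},
    "kbart": {
        "lockss": {"year_rle": [1998, [2001, 2004]]},
    },
    "issnp": "1234-5678",
    "languages": ["en"],
    "is_oa": True,
}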
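Finally, a small sketch of how the or_none()/truthy() helpers normalize raw CSV cells before parse_record() builds the entity. The header here is trimmed to a subset of the real columns, and the row values are made up:

import csv, io

sample = io.StringIO(
    "ISSN-L,in_doaj,in_road,title,publisher,ISSN-print,is_oa\n"
    "1234-5678,true,no,Example Journal,Example Press,,1\n"
)
row = next(csv.DictReader(sample))
assert truthy(row['in_doaj']) is True      # 'true' -> True
assert truthy(row['in_road']) is False     # 'no' -> False
assert truthy(row['is_oa']) is True        # '1' -> True
assert or_none(row['ISSN-print']) is None  # empty string -> None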