From 2cadc1f1bca6fb5c3b7eb940b838023322fc0eeb Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 24 Jan 2019 13:06:42 -0800
Subject: notes on refactoring container 'extra'

---
 python/fatcat_tools/importers/journal_metadata.py | 79 +++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index ccdb7ec6..cf3971b5 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -33,6 +33,85 @@ class JournalMetadataImporter(EntityImporter):
     CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
 
         ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+
+
+    'extra' fields:
+
+        doaj
+            as_of: datetime of most recent check; if not set, not actually in DOAJ
+            seal: bool
+            work_level: bool (are work-level publications deposited with DOAJ?)
+            archiving: array, can include 'library' or 'other'
+        road
+            as_of: datetime of most recent check; if not set, not actually in ROAD
+        pubmed (TODO: delete?)
+            as_of: datetime of most recent check; if not set, not actually indexed in pubmed
+        norwegian (TODO: drop this?)
+            as_of: datetime of most recent check; if not set, not actually indexed in pubmed
+            id (integer)
+            level (integer; 0-2)
+        kbart
+            lockss
+                year_rle
+                volume_rle
+            portico
+                ...
+            clockss
+                ...
+        sherpa_romeo
+            color
+        jstor
+            year_rle
+            volume_rle
+        scopus
+            id
+            TODO: print/electronic distinction?
+        wos
+            id
+        doi
+            crossref_doi: DOI of the title in crossref (if exists)
+            prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref)
+        ia
+            sim
+                nap_id
+                year_rle
+                volume_rle
+            longtail: boolean
+            homepage
+                as_of: datetime of last attempt
+                url
+                status: HTTP/heritrix status of homepage crawl
+
+        issnp: string
+        issne: string
+        coden: string
+        abbrev: string
+        oclc_id: string (TODO: lookup?)
+        lccn_id: string (TODO: lookup?)
+        dblb_id: string
+        default_license: slug
+        original_name: native name (if name is translated)
+        platform: hosting platform: OJS, wordpress, scielo, etc
+        mimetypes: array of strings (eg, 'application/pdf', 'text/html')
+        first_year: year (integer)
+        last_year: if publishing has stopped
+        primary_language: single ISO code, or 'mixed'
+        languages: array of ISO codes
+        region: TODO: continent/world-region
+        nation: shortcode of nation
+        discipline: TODO: highest-level subject; "life science", "humanities", etc
+        field: TODO: narrower description of field
+        subjects: TODO?
+        url: homepage
+        is_oa: boolean. If true, can assume all releases under this container are "Open Access"
+        TODO: domains, if exclusive?
+        TODO: fulltext_regex, if a known pattern?
+
+    For KBART, etc:
+        We "over-count" on the assumption that "in-progress" status works will soon actually be preserved.
+        year and volume spans are run-length-encoded arrays, using integers:
+            - if an integer, means that year is preserved
+            - if an array of length 2, means everything between the two numbers (inclusive) is preserved
     """
 
     def __init__(self, api, **kwargs):
-- 
cgit v1.2.3