aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-01-24 13:06:42 -0800
committerBryan Newbold <bnewbold@robocracy.org>2019-01-24 13:06:42 -0800
commit2cadc1f1bca6fb5c3b7eb940b838023322fc0eeb (patch)
tree3e4c961cd6e73ed27b9b7ef3724fabe40a0819d4
parent92189ad99ae7f799377a0fcbb928e09ff1f82a79 (diff)
downloadfatcat-2cadc1f1bca6fb5c3b7eb940b838023322fc0eeb.tar.gz
fatcat-2cadc1f1bca6fb5c3b7eb940b838023322fc0eeb.zip
notes on refactoring container 'extra'
-rw-r--r--python/fatcat_tools/importers/journal_metadata.py79
1 files changed, 79 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index ccdb7ec6..cf3971b5 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -33,6 +33,85 @@ class JournalMetadataImporter(EntityImporter):
CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+
+
+ 'extra' fields:
+
+ doaj
+ as_of: datetime of most recent check; if not set, not actually in DOAJ
+ seal: bool
+ work_level: bool (are work-level publications deposited with DOAJ?)
+ archiving: array, can include 'library' or 'other'
+ road
+ as_of: datetime of most recent check; if not set, not actually in ROAD
+ pubmed (TODO: delete?)
+ as_of: datetime of most recent check; if not set, not actually indexed in pubmed
+ norwegian (TODO: drop this?)
+ as_of: datetime of most recent check; if not set, not actually indexed in pubmed
+ id (integer)
+ level (integer; 0-2)
+ kbart
+ lockss
+ year_rle
+ volume_rle
+ portico
+ ...
+ clockss
+ ...
+ sherpa_romeo
+ color
+ jstor
+ year_rle
+ volume_rle
+ scopus
+ id
+ TODO: print/electronic distinction?
+ wos
+ id
+ doi
+ crossref_doi: DOI of the title in crossref (if exists)
+ prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref)
+ ia
+ sim
+ nap_id
+ year_rle
+ volume_rle
+ longtail: boolean
+ homepage
+ as_of: datetime of last attempt
+ url
+ status: HTTP/heritrix status of homepage crawl
+
+ issnp: string
+ issne: string
+ coden: string
+ abbrev: string
+ oclc_id: string (TODO: lookup?)
+ lccn_id: string (TODO: lookup?)
+ dblb_id: string
+ default_license: slug
+ original_name: native name (if name is translated)
+ platform: hosting platform: OJS, wordpress, scielo, etc
+ mimetypes: array of strings (eg, 'application/pdf', 'text/html')
+ first_year: year (integer)
+ last_year: if publishing has stopped
+ primary_language: single ISO code, or 'mixed'
+ languages: array of ISO codes
+ region: TODO: continent/world-region
+ nation: shortcode of nation
+ discipline: TODO: highest-level subject; "life science", "humanities", etc
+ field: TODO: narrower description of field
+ subjects: TODO?
+ url: homepage
+ is_oa: boolean. If true, can assume all releases under this container are "Open Access"
+ TODO: domains, if exclusive?
+ TODO: fulltext_regex, if a known pattern?
+
+ For KBART, etc:
+ We "over-count" on the assumption that "in-progress" status works will soon actually be preserved.
+ year and volume spans are run-length-encoded arrays, using integers:
+ - if an integer, means that year is preserved
+ - if an array of length 2, means everything between the two numbers (inclusive) is preserved
"""
def __init__(self, api, **kwargs):