aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-01-25 16:42:28 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-01-25 16:42:28 -0800
commit 7d55f030cadbdc213e0773a52ad52ccfbfa07cad (patch)
tree 8a19720c83521e81a865dfd1a5a7a6890652e49d /python/fatcat_tools/importers
parent 2e6e5d9b270044d3462a95512a12520650cc45af (diff)
download fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.tar.gz
fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.zip
update journal meta import/transform
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/journal_metadata.py 143
1 files changed, 39 insertions, 104 deletions
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index cf3971b5..7f6b1ee8 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -28,90 +28,9 @@ def truthy(s):
class JournalMetadataImporter(EntityImporter):
"""
Imports journal metadata ("containers") by ISSN, currently from a custom
- (data munged) .csv file format
-
- CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
-
- ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
-
-
- 'extra' fields:
-
- doaj
- as_of: datetime of most recent check; if not set, not actually in DOAJ
- seal: bool
- work_level: bool (are work-level publications deposited with DOAJ?)
- archiving: array, can include 'library' or 'other'
- road
- as_of: datetime of most recent check; if not set, not actually in ROAD
- pubmed (TODO: delete?)
- as_of: datetime of most recent check; if not set, not actually indexed in pubmed
- norwegian (TODO: drop this?)
- as_of: datetime of most recent check; if not set, not actually indexed in pubmed
- id (integer)
- level (integer; 0-2)
- kbart
- lockss
- year_rle
- volume_rle
- portico
- ...
- clockss
- ...
- sherpa_romeo
- color
- jstor
- year_rle
- volume_rle
- scopus
- id
- TODO: print/electronic distinction?
- wos
- id
- doi
- crossref_doi: DOI of the title in crossref (if exists)
- prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref)
- ia
- sim
- nap_id
- year_rle
- volume_rle
- longtail: boolean
- homepage
- as_of: datetime of last attempt
- url
- status: HTTP/heritrix status of homepage crawl
-
- issnp: string
- issne: string
- coden: string
- abbrev: string
- oclc_id: string (TODO: lookup?)
- lccn_id: string (TODO: lookup?)
- dblb_id: string
- default_license: slug
- original_name: native name (if name is translated)
- platform: hosting platform: OJS, wordpress, scielo, etc
- mimetypes: array of strings (eg, 'application/pdf', 'text/html')
- first_year: year (integer)
- last_year: if publishing has stopped
- primary_language: single ISO code, or 'mixed'
- languages: array of ISO codes
- region: TODO: continent/world-region
- nation: shortcode of nation
- discipline: TODO: highest-level subject; "life science", "humanities", etc
- field: TODO: narrower description of field
- subjects: TODO?
- url: homepage
- is_oa: boolean. If true, can assume all releases under this container are "Open Access"
- TODO: domains, if exclusive?
- TODO: fulltext_regex, if a known pattern?
-
- For KBART, etc:
- We "over-count" on the assumption that "in-progress" status works will soon actually be preserved.
- year and volume spans are run-length-encoded arrays, using integers:
- - if an integer, means that year is preserved
- - if an array of length 2, means everything between the two numbers (inclusive) is preserved
+ munged JSON format (see ../extra/journal_metadata/).
+
+ See guide for details on the many 'extra' fields used here.
"""
def __init__(self, api, **kwargs):
@@ -125,34 +44,50 @@ class JournalMetadataImporter(EntityImporter):
editgroup_extra=eg_extra)
def want(self, raw_record):
- if raw_record.get('ISSN-L'):
+ if raw_record.get('issnl'):
return True
return False
def parse_record(self, row):
"""
- row is a python dict (parsed from CSV).
+ row is a python dict (parsed from JSON).
+
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
- title = or_none(row['title'])
- issnl = or_none(row['ISSN-L'])
- if title is None or issnl is None:
- return None
- extra = dict(
- in_doaj=truthy(row['in_doaj']),
- in_road=truthy(row['in_road']),
- in_norwegian=truthy(row['in_norwegian']),
- language=or_none(row['lang']),
- url=or_none(row['url']),
- ISSNp=or_none(row['ISSN-print']),
- ISSNe=or_none(row['ISSN-electronic']),
- is_oa=truthy(row['is_oa']),
- is_kept=truthy(row['is_kept']),
- )
+
+ extra = dict()
+ for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
+ 'coden', 'aliases', 'original_name', 'first_year', 'last_year',
+ 'platform', 'default_license', 'road', 'mimetypes',
+ 'sherpa_romeo', 'kbart'):
+ if row.get(key):
+ extra[key] = row[key]
+ # TODO: not including for now: norwegian, dois/crossref, ia
+
+ extra_doaj = dict()
+ if row.get('doaj'):
+ if row['doaj'].get('as_of'):
+ extra_doaj['as_of'] = row['doaj']['as_of']
+ if row['doaj'].get('works'):
+ extra_doaj['works'] = row['doaj']['works']
+ if extra_doaj:
+ extra['doaj'] = extra_doaj
+
+ extra_ia = dict()
+ # TODO: would like an ia.longtail_ia flag
+ if row.get('sim'):
+ extra_ia['sim'] = {
+ 'year_spans': row['sim']['year_spans'],
+ }
+ if extra_ia:
+ extra['ia'] = extra_ia
+
ce = fatcat_client.ContainerEntity(
- issnl=issnl,
- name=clean(title),
- publisher=or_none(clean(row['publisher'])),
+ issnl=row['issnl'],
+ container_type=None, # TODO
+ name=clean(row.get('name')),
+ publisher=clean(row.get('publisher')),
+ wikidata_qid=None, # TODO
extra=extra)
return ce