From 7d55f030cadbdc213e0773a52ad52ccfbfa07cad Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 25 Jan 2019 16:42:28 -0800 Subject: update journal meta import/transform --- python/fatcat_tools/importers/journal_metadata.py | 143 ++++++---------------- 1 file changed, 39 insertions(+), 104 deletions(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index cf3971b5..7f6b1ee8 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -28,90 +28,9 @@ def truthy(s): class JournalMetadataImporter(EntityImporter): """ Imports journal metadata ("containers") by ISSN, currently from a custom - (data munged) .csv file format - - CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): - - ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count - - - 'extra' fields: - - doaj - as_of: datetime of most recent check; if not set, not actually in DOAJ - seal: bool - work_level: bool (are work-level publications deposited with DOAJ?) - archiving: array, can include 'library' or 'other' - road - as_of: datetime of most recent check; if not set, not actually in ROAD - pubmed (TODO: delete?) - as_of: datetime of most recent check; if not set, not actually indexed in pubmed - norwegian (TODO: drop this?) - as_of: datetime of most recent check; if not set, not actually indexed in pubmed - id (integer) - level (integer; 0-2) - kbart - lockss - year_rle - volume_rle - portico - ... - clockss - ... - sherpa_romeo - color - jstor - year_rle - volume_rle - scopus - id - TODO: print/electronic distinction? - wos - id - doi - crossref_doi: DOI of the title in crossref (if exists) - prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref) - ia - sim - nap_id - year_rle - volume_rle - longtail: boolean - homepage - as_of: datetime of last attempt - url - status: HTTP/heritrix status of homepage crawl - - issnp: string - issne: string - coden: string - abbrev: string - oclc_id: string (TODO: lookup?) - lccn_id: string (TODO: lookup?) - dblb_id: string - default_license: slug - original_name: native name (if name is translated) - platform: hosting platform: OJS, wordpress, scielo, etc - mimetypes: array of strings (eg, 'application/pdf', 'text/html') - first_year: year (integer) - last_year: if publishing has stopped - primary_language: single ISO code, or 'mixed' - languages: array of ISO codes - region: TODO: continent/world-region - nation: shortcode of nation - discipline: TODO: highest-level subject; "life science", "humanities", etc - field: TODO: narrower description of field - subjects: TODO? - url: homepage - is_oa: boolean. If true, can assume all releases under this container are "Open Access" - TODO: domains, if exclusive? - TODO: fulltext_regex, if a known pattern? - - For KBART, etc: - We "over-count" on the assumption that "in-progress" status works will soon actually be preserved. - year and volume spans are run-length-encoded arrays, using integers: - - if an integer, means that year is preserved - - if an array of length 2, means everything between the two numbers (inclusive) is preserved + munged JSON format (see ../extra/journal_metadata/). + + See guide for details on the many 'extra' fields used here. """ def __init__(self, api, **kwargs): @@ -125,34 +44,50 @@ class JournalMetadataImporter(EntityImporter): editgroup_extra=eg_extra) def want(self, raw_record): - if raw_record.get('ISSN-L'): + if raw_record.get('issnl'): return True return False def parse_record(self, row): """ - row is a python dict (parsed from CSV). + row is a python dict (parsed from JSON). + returns a ContainerEntity (or None if invalid or couldn't parse) """ - title = or_none(row['title']) - issnl = or_none(row['ISSN-L']) - if title is None or issnl is None: - return None - extra = dict( - in_doaj=truthy(row['in_doaj']), - in_road=truthy(row['in_road']), - in_norwegian=truthy(row['in_norwegian']), - language=or_none(row['lang']), - url=or_none(row['url']), - ISSNp=or_none(row['ISSN-print']), - ISSNe=or_none(row['ISSN-electronic']), - is_oa=truthy(row['is_oa']), - is_kept=truthy(row['is_kept']), - ) + + extra = dict() + for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev', + 'coden', 'aliases', 'original_name', 'first_year', 'last_year', + 'platform', 'default_license', 'road', 'mimetypes', + 'sherpa_romeo', 'kbart'): + if row.get(key): + extra[key] = row[key] + # TODO: not including for now: norwegian, dois/crossref, ia + + extra_doaj = dict() + if row.get('doaj'): + if row['doaj'].get('as_of'): + extra_doaj['as_of'] = row['doaj']['as_of'] + if row['doaj'].get('works'): + extra_doaj['works'] = row['doaj']['works'] + if extra_doaj: + extra['doaj'] = extra_doaj + + extra_ia = dict() + # TODO: would like an ia.longtail_ia flag + if row.get('sim'): + extra_ia['sim'] = { + 'year_spans': row['sim']['year_spans'], + } + if extra_ia: + extra['ia'] = extra_ia + ce = fatcat_client.ContainerEntity( - issnl=issnl, - name=clean(title), - publisher=or_none(clean(row['publisher'])), + issnl=row['issnl'], + container_type=None, # TODO + name=clean(row.get('name')), + publisher=clean(row.get('publisher')), + wikidata_qid=None, # TODO extra=extra) return ce -- cgit v1.2.3