diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-25 16:42:28 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-25 16:42:28 -0800 |
commit | 7d55f030cadbdc213e0773a52ad52ccfbfa07cad (patch) | |
tree | 8a19720c83521e81a865dfd1a5a7a6890652e49d /python/fatcat_tools | |
parent | 2e6e5d9b270044d3462a95512a12520650cc45af (diff) | |
download | fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.tar.gz fatcat-7d55f030cadbdc213e0773a52ad52ccfbfa07cad.zip |
update journal meta import/transform
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/journal_metadata.py | 143 | ||||
-rw-r--r-- | python/fatcat_tools/transforms.py | 183 |
2 files changed, 194 insertions, 132 deletions
class JournalMetadataImporter(EntityImporter):
    """
    Imports journal metadata ("containers") by ISSN, currently from a custom
    munged JSON format (see ../extra/journal_metadata/).

    See guide for details on the many 'extra' fields used here.
    """

    # NOTE: __init__ unchanged (elided from this view); it forwards to
    # EntityImporter with editgroup description/extra metadata.

    def want(self, raw_record):
        # Only records carrying an ISSN-L can become container entities.
        return bool(raw_record.get('issnl'))

    def parse_record(self, row):
        """
        Parse one munged-JSON row into a container.

        row: python dict (parsed from JSON).
        Returns a ContainerEntity, or None if invalid or couldn't parse.
        """

        # Copy simple fields straight through into 'extra' when present.
        extra = dict()
        for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
                    'coden', 'aliases', 'original_name', 'first_year', 'last_year',
                    'platform', 'default_license', 'road', 'mimetypes',
                    'sherpa_romeo', 'kbart'):
            if row.get(key):
                extra[key] = row[key]
        # TODO: not including for now: norwegian, dois/crossref, ia

        # DOAJ sub-object: only as_of (datetime of last check) and works
        # (work-level deposit flag) are retained.
        extra_doaj = dict()
        if row.get('doaj'):
            if row['doaj'].get('as_of'):
                extra_doaj['as_of'] = row['doaj']['as_of']
            if row['doaj'].get('works'):
                extra_doaj['works'] = row['doaj']['works']
        if extra_doaj:
            extra['doaj'] = extra_doaj

        # Internet Archive sub-object (microfilm/SIM preservation spans).
        extra_ia = dict()
        # TODO: would like an ia.longtail_ia flag
        if row.get('sim'):
            extra_ia['sim'] = {
                'year_spans': row['sim']['year_spans'],
            }
        if extra_ia:
            extra['ia'] = extra_ia

        # A container without a (cleaned) name is invalid; per the contract
        # above, return None rather than creating a nameless entity.
        name = clean(row.get('name'))
        if not name:
            return None

        ce = fatcat_client.ContainerEntity(
            issnl=row['issnl'],
            container_type=None, # TODO
            name=name,
            publisher=clean(row.get('publisher')),
            wikidata_qid=None, # TODO
            extra=extra)
        return ce
release_to_elasticsearch(release): +def check_kbart(year, archive): + if not archive or not archive.get('year_spans'): + return None + for span in archive['year_spans']: + if year >= span[0] and year <= span[1]: + return True + return False + +def test_check_kbart(): + + assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) == False + assert check_kbart(2000, dict(year_spans=[[2000, 2000]])) == True + assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False + assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True + +def release_to_elasticsearch(entity): """ Converts from an entity model/schema to elasticsearch oriented schema. @@ -27,15 +42,16 @@ def release_to_elasticsearch(release): Raises exception on error (never returns None) """ - if release.state in ('redirect', 'deleted'): + if entity.state in ('redirect', 'deleted'): return dict( - ident = release.ident, - state = release.state, + ident = entity.ident, + state = entity.state, ) - elif release.state != 'active': - raise ValueError("Unhandled release state: {}".format(release.state)) + elif entity.state != 'active': + raise ValueError("Unhandled entity state: {}".format(entity.state)) # First, the easy ones (direct copy) + release = entity t = dict( ident = release.ident, state = release.state, @@ -57,11 +73,14 @@ def release_to_elasticsearch(release): ) is_oa = None + is_preserved = None is_longtail_oa = None in_kbart = None + in_jstor = False in_web = False in_dweb = False in_ia = False + in_ia_sim = False in_shadow = False if release.release_date: @@ -88,19 +107,35 @@ def release_to_elasticsearch(release): t['container_issnl'] = container.issnl t['container_type'] = container.container_type if container.extra: - if container.extra.get('is_oa') or container.extra.get('in_doaj'): + c_extra = container.extra + if c_extra.get('kbart') and release.year: + in_jstor = check_kbart(release.year, c_extra['kbart'].get('jstor')) + in_kbart = in_jstor + for archive 
in ('portico', 'lockss', 'clockss'): + in_kbart = in_kbart or check_kbart(release.year, c_extra['kbart'].get(archive)) + + if c_extra.get('ia'): + if c_extra['ia'].get('sim') and release.year: + in_ia_sim = check_kbart(release, c_extra['ia']['sim'].get('year_spans')) + if c_extra['ia'].get('longtail_oa'): + is_longtail_oa = True + if c_extra.get('sherpa_romeo'): + if c_extra['sherpa_romeo'].get('color') == 'white': + is_oa = False + if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'): is_oa = True - if container.extra.get('in_kbart'): - # TODO: better KBART check goes here - in_kbart = True - if container.extra.get('ia'): - # TODO: container longtail check goes here - # TODO: sim/microfilm check goes here - pass - # TODO: SHERPA/Romeo goes here + if c_extra.get('doaj'): + if c_extra['doaj'].get('as_of'): + is_oa = True + if c_extra.get('road'): + if c_extra['road'].get('as_of'): + is_oa = True else: t['publisher'] = release.publisher + if release.jstor_id or (release.doi and release.doi.startswith('10.2307/')): + in_jstor = True + files = release.files or [] t['file_count'] = len(files) t['fileset_count'] = len(release.filesets or []) @@ -118,13 +153,15 @@ def release_to_elasticsearch(release): if url.url.lower().startswith('http'): in_web = True if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): - # TODO: not sure what rel will be + # not sure what rel will be for this stuff in_dweb = True if is_pdf: any_pdf_url = url.url if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf: is_preserved = True good_pdf_url = url.url + if '//www.jstor.org/' in url.url: + in_jstor = True if '//web.archive.org/' in url.url or '//archive.org/' in url.url: in_ia = True if is_pdf: @@ -141,18 +178,15 @@ def release_to_elasticsearch(release): extra = release.extra or dict() if extra: - # TODO: longtail OA check from GROBID here - if extra.get('in_kbart'): - # NOTE: not actually setting this anywhere - in_kbart = True if 
def container_to_elasticsearch(entity):
    """
    Converts from an entity model/schema to elasticsearch oriented schema.

    Returns: dict
    Raises exception on error (never returns None)
    """

    if entity.state in ('redirect', 'deleted'):
        return dict(
            ident = entity.ident,
            state = entity.state,
        )
    elif entity.state != 'active':
        raise ValueError("Unhandled entity state: {}".format(entity.state))

    # First, the easy ones (direct copy)
    # FIX: dropped the release-specific fields (language, license, doi, pmid,
    # isbn13, core_id, arxiv_id, jstor_id, entity_status) that were
    # copy-pasted from release_to_elasticsearch; ContainerEntity has no such
    # attributes, so they raised AttributeError (and arxiv_id was even being
    # copied from core_id).
    t = dict(
        ident = entity.ident,
        state = entity.state,
        revision = entity.revision,

        name = entity.name,
        publisher = entity.publisher,
        container_type = entity.container_type,
        issnl = entity.issnl,
        wikidata_qid = entity.wikidata_qid,
    )

    # TODO: region, discipline
    # TODO: single primary language?
    # Guard against missing extra (consistent with release_to_elasticsearch).
    extra = entity.extra or dict()
    for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
        if extra.get(key):
            t[key] = extra[key]

    # Tri-state flags: None means "unknown", not "false".
    in_doaj = None
    in_road = None
    # TODO: not currently implemented
    in_doi = None
    # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
    #in_doaj_works = None
    in_sherpa_romeo = None
    is_oa = None
    # TODO: not actually set/stored anywhere?
    is_longtail_oa = None
    any_kbart = None
    any_jstor = None
    any_ia_sim = None

    if extra.get('doaj'):
        # an as_of datetime means the container was actually seen in DOAJ
        if extra['doaj'].get('as_of'):
            in_doaj = True
    if extra.get('road'):
        if extra['road'].get('as_of'):
            in_road = True
    if extra.get('default_license'):
        if extra['default_license'].startswith('CC-'):
            is_oa = True
    if extra.get('sherpa_romeo'):
        in_sherpa_romeo = True
        # SHERPA/RoMEO "white" color means no self-archiving: not OA
        if extra['sherpa_romeo'].get('color') == 'white':
            is_oa = False
    if extra.get('kbart'):
        any_kbart = True
        if extra['kbart'].get('jstor'):
            any_jstor = True
    if extra.get('ia'):
        if extra['ia'].get('sim'):
            any_ia_sim = True

    # FIX: the assignments below referenced undefined names (is_doaj, is_road,
    # ia_oa) and wrongly assigned any_ia_sim to any_kbart/any_jstor.
    t['in_doaj'] = in_doaj
    t['in_road'] = in_road
    t['in_doi'] = in_doi
    t['in_sherpa_romeo'] = in_sherpa_romeo
    t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
    t['is_longtail_oa'] = is_longtail_oa
    t['any_kbart'] = any_kbart
    t['any_jstor'] = any_jstor
    t['any_ia_sim'] = bool(any_ia_sim)
    return t