refactor transforms into sub-dir

author: Bryan Newbold <bnewbold@robocracy.org> 2019-03-11 16:38:51 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-03-11 16:38:51 -0700
commit: 655f7060eb5b5e711a8a892cb1085639c4aa8fd2 (patch)
tree: ffa1139e0c56b6510ec71d1aa8cc426423449f11 /python/fatcat_tools/transforms/elasticsearch.py
parent: c937447f894cfde54628fecf3fa71127cb769f0c (diff)
download: fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.tar.gz
fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.zip
1 files changed, 327 insertions, 0 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
new file mode 100644
index 00000000..0c2c5e46
--- /dev/null
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -0,0 +1,327 @@
+
+
+import collections
+from fatcat_client import ApiClient
+
+
+def check_kbart(year, archive):
+    """
+    Checks whether a year is covered by one of an archive's 'year_spans'.
+
+    Each span is read as [first_year, last_year]; both ends are inclusive.
+
+    Returns: True if any span covers the year, False if none does, or None
+    if archive is empty or has no (or an empty) 'year_spans' entry
+    """
+    spans = archive.get('year_spans') if archive else None
+    if not spans:
+        # nothing known about this archive's coverage
+        return None
+    return any(span[0] <= year <= span[1] for span in spans)
+
+def test_check_kbart():
+    """
+    Spot-checks check_kbart() with years inside and outside of the spans.
+    """
+    # (year, year_spans, expected result)
+    cases = [
+        (1990, [[2000, 2000]], False),
+        (2000, [[2000, 2000]], True),
+        (1950, [[1900, 1920], [1990, 2000]], False),
+        (1950, [[1900, 1920], [1930, 2000]], True),
+    ]
+    for year, spans, expected in cases:
+        assert check_kbart(year, dict(year_spans=spans)) == expected
+
+def release_to_elasticsearch(entity, force_bool=True):
+    """
+    Converts from an entity model/schema to elasticsearch oriented schema.
+
+    entity: release entity. Entities in the 'redirect' or 'deleted' state
+        are reduced to a stub with only 'ident' and 'state'. For 'active'
+        entities the container, files, extra and the list-valued attributes
+        (abstracts, refs, contribs, filesets, webcaptures) may be None.
+    force_bool: when True (the default) the is_oa, is_longtail_oa, in_kbart,
+        in_ia_sim, in_jstor, in_web and in_dweb flags are coerced to bool;
+        when False they are copied as computed, so the ones that could not
+        be determined stay None. in_ia and is_preserved are always bool.
+
+    Returns: dict
+    Raises exception on error (never returns None); in particular ValueError
+    for any state other than 'active', 'redirect' or 'deleted'
+    """
+
+    if entity.state in ('redirect', 'deleted'):
+        return dict(
+            ident = entity.ident,
+            state = entity.state,
+        )
+    elif entity.state != 'active':
+        raise ValueError("Unhandled entity state: {}".format(entity.state))
+
+    # First, the easy ones (direct copy)
+    release = entity
+    t = dict(
+        ident = release.ident,
+        state = release.state,
+        revision = release.revision,
+        title = release.title,
+        original_title = release.original_title,
+        release_type = release.release_type,
+        release_status = release.release_status,
+        language = release.language,
+        license = release.license_slug,
+        doi = release.doi,
+        pmid = release.pmid,
+        pmcid = release.pmcid,
+        isbn13 = release.isbn13,
+        wikidata_qid = release.wikidata_qid,
+        core_id = release.core_id,
+        arxiv_id = release.arxiv_id,
+        jstor_id = release.jstor_id,
+    )
+
+    # Flags starting as None are tri-state (None means "not determined");
+    # the ones starting as False only ever get switched on.
+    is_oa = None
+    is_preserved = None
+    is_longtail_oa = None
+    in_kbart = None
+    in_jstor = False
+    in_web = False
+    in_dweb = False
+    in_ia = False
+    in_ia_sim = False
+    in_shadow = False
+
+    release_year = release.release_year
+    if release.release_date:
+        # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
+        t['release_date'] = release.release_date.isoformat()
+        if not release_year:
+            release_year = release.release_date.year
+    if release_year:
+        t['release_year'] = release_year
+
+    # abstracts may be None, same as refs and contribs
+    t['any_abstract'] = len(release.abstracts or []) > 0
+    t['ref_count'] = len(release.refs or [])
+    t['contrib_count'] = len(release.contribs or [])
+    contrib_names = []
+    for c in (release.contribs or []):
+        if c.raw_name:
+            contrib_names.append(c.raw_name)
+    t['contrib_names'] = contrib_names
+
+    container = release.container
+    if container:
+        t['publisher'] = container.publisher
+        t['container_name'] = container.name
+        t['container_id'] = container.ident
+        t['container_issnl'] = container.issnl
+        t['container_type'] = container.container_type
+        if container.extra:
+            c_extra = container.extra
+            # coverage checks only make sense when the release year is known
+            if c_extra.get('kbart') and release_year:
+                in_jstor = check_kbart(release_year, c_extra['kbart'].get('jstor'))
+                in_kbart = in_jstor
+                for archive in ('portico', 'lockss', 'clockss'):
+                    in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive))
+
+            if c_extra.get('ia'):
+                if c_extra['ia'].get('sim') and release_year:
+                    in_ia_sim = check_kbart(release_year, c_extra['ia']['sim'])
+                if c_extra['ia'].get('longtail_oa'):
+                    is_longtail_oa = True
+            # NOTE: order matters; the positive OA signals below override a
+            # 'white' sherpa_romeo color
+            if c_extra.get('sherpa_romeo'):
+                if c_extra['sherpa_romeo'].get('color') == 'white':
+                    is_oa = False
+            if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
+                is_oa = True
+            if c_extra.get('doaj'):
+                if c_extra['doaj'].get('as_of'):
+                    is_oa = True
+            if c_extra.get('road'):
+                if c_extra['road'].get('as_of'):
+                    is_oa = True
+    else:
+        t['publisher'] = release.publisher
+
+    if release.jstor_id or (release.doi and release.doi.startswith('10.2307/')):
+        in_jstor = True
+
+    files = release.files or []
+    t['file_count'] = len(files)
+    t['fileset_count'] = len(release.filesets or [])
+    t['webcapture_count'] = len(release.webcaptures or [])
+    any_pdf_url = None
+    good_pdf_url = None
+    best_pdf_url = None
+    ia_pdf_url = None
+    for f in files:
+        if f.extra and f.extra.get('shadows'):
+            # TODO: shadow check goes here. The flag is tracked, but it is
+            # not yet copied into the output document.
+            in_shadow = True
+        is_pdf = 'pdf' in (f.mimetype or '')
+        for url in (f.urls or []):
+            if url.url.lower().startswith('http'):
+                in_web = True
+            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+                # not sure what rel will be for this stuff
+                in_dweb = True
+            if is_pdf:
+                any_pdf_url = url.url
+            if is_pdf and url.rel in ('webarchive', 'repository'):
+                is_preserved = True
+                good_pdf_url = url.url
+            if '//www.jstor.org/' in url.url:
+                in_jstor = True
+            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
+                in_ia = True
+                if is_pdf:
+                    best_pdf_url = url.url
+                    ia_pdf_url = url.url
+    # here is where we bake-in priority; IA-specific
+    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+    t['ia_pdf_url'] = ia_pdf_url
+
+    if release.license_slug:
+        # TODO: more/better checks here, particularly strict *not* OA licenses
+        if release.license_slug.startswith("CC-"):
+            is_oa = True
+
+    extra = release.extra or dict()
+    if extra:
+        if extra.get('is_oa'):
+            # NOTE: not actually setting this anywhere... but could
+            is_oa = True
+        if extra.get('longtail_oa'):
+            # sometimes set by GROBID/matcher
+            is_oa = True
+            is_longtail_oa = True
+        if not t.get('container_name'):
+            t['container_name'] = extra.get('container_name')
+        if extra.get('crossref'):
+            if extra['crossref'].get('archive'):
+                # all crossref archives are KBART, I believe
+                in_kbart = True
+
+    if is_longtail_oa:
+        is_oa = True
+
+    if force_bool:
+        t['is_oa'] = bool(is_oa)
+        t['is_longtail_oa'] = bool(is_longtail_oa)
+        t['in_kbart'] = bool(in_kbart)
+        t['in_ia_sim'] = bool(in_ia_sim)
+        t['in_jstor'] = bool(in_jstor)
+        t['in_web'] = bool(in_web)
+        t['in_dweb'] = bool(in_dweb)
+    else:
+        t['is_oa'] = is_oa
+        t['is_longtail_oa'] = is_longtail_oa
+        t['in_kbart'] = in_kbart
+        t['in_ia_sim'] = in_ia_sim
+        t['in_jstor'] = in_jstor
+        t['in_web'] = in_web
+        t['in_dweb'] = in_dweb
+
+    t['in_ia'] = bool(in_ia)
+    t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+    return t
+
+def container_to_elasticsearch(entity, force_bool=True):
+    """
+    Converts from an entity model/schema to elasticsearch oriented schema.
+
+    entity: container entity. Entities in the 'redirect' or 'deleted' state
+        are reduced to a stub with only 'ident' and 'state'. The entity is
+        only read, never modified; its extra attribute may be None.
+    force_bool: when True (the default) the in_doi, is_oa, any_jstor and
+        any_ia_sim flags are coerced to bool; when False they are copied as
+        computed and may be None. The in_doaj, in_road, in_sherpa_romeo,
+        any_kbart and is_longtail_oa flags are always bool.
+
+    Returns: dict
+    Raises exception on error (never returns None); in particular ValueError
+    for any state other than 'active', 'redirect' or 'deleted'
+    """
+
+    if entity.state in ('redirect', 'deleted'):
+        return dict(
+            ident = entity.ident,
+            state = entity.state,
+        )
+    elif entity.state != 'active':
+        raise ValueError("Unhandled entity state: {}".format(entity.state))
+
+    # First, the easy ones (direct copy)
+    t = dict(
+        ident = entity.ident,
+        state = entity.state,
+        revision = entity.revision,
+
+        name = entity.name,
+        publisher = entity.publisher,
+        container_type = entity.container_type,
+        issnl = entity.issnl,
+        wikidata_qid = entity.wikidata_qid,
+    )
+
+    # Use a local fallback instead of assigning to entity.extra, so that
+    # the caller's entity is left untouched
+    extra = entity.extra or dict()
+
+    # TODO: region, discipline
+    # TODO: single primary language?
+    for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
+        if extra.get(key):
+            t[key] = extra[key]
+
+    in_doaj = None
+    in_road = None
+    # TODO: not currently implemented
+    in_doi = None
+    # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
+    #in_doaj_works = None
+    in_sherpa_romeo = None
+    is_oa = None
+    # TODO: not actually set/stored anywhere?
+    is_longtail_oa = None
+    any_kbart = None
+    any_jstor = None
+    any_ia_sim = None
+
+    if extra.get('doaj'):
+        if extra['doaj'].get('as_of'):
+            in_doaj = True
+    if extra.get('road'):
+        if extra['road'].get('as_of'):
+            in_road = True
+    if extra.get('default_license'):
+        if extra['default_license'].startswith('CC-'):
+            is_oa = True
+    # NOTE: checked after default_license, so a 'white' sherpa_romeo color
+    # wins over a CC default license (but not over DOAJ/ROAD, see below)
+    if extra.get('sherpa_romeo'):
+        in_sherpa_romeo = True
+        if extra['sherpa_romeo'].get('color') == 'white':
+            is_oa = False
+    if extra.get('kbart'):
+        any_kbart = True
+        if extra['kbart'].get('jstor'):
+            any_jstor = True
+    if extra.get('ia'):
+        if extra['ia'].get('sim'):
+            any_ia_sim = True
+
+    t['in_doaj'] = bool(in_doaj)
+    t['in_road'] = bool(in_road)
+    t['in_sherpa_romeo'] = bool(in_sherpa_romeo)
+    t['any_kbart'] = bool(any_kbart)
+    t['is_longtail_oa'] = bool(is_longtail_oa)
+    if force_bool:
+        t['in_doi'] = bool(in_doi)
+        t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa)
+        t['any_jstor'] = bool(any_jstor)
+        t['any_ia_sim'] = bool(any_ia_sim)
+    else:
+        t['in_doi'] = in_doi
+        t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
+        t['any_jstor'] = any_jstor
+        t['any_ia_sim'] = any_ia_sim
+    return t
+
+
+def changelog_to_elasticsearch(entity):
+    """
+    Converts a changelog entry to an elasticsearch oriented schema.
+
+    entity: changelog entry. Its index, editgroup_id, timestamp and
+        editgroup attributes are read; the editgroup needs editor_id, extra
+        (may be None) and edits, where edits holds one list of edits per
+        entity type.
+
+    Returns: dict with the changelog fields, the 'agent' from the editgroup
+    extra (only if set), and the number of edits for each entity type
+    """
+
+    editgroup = entity.editgroup
+    t = dict(
+        index=entity.index,
+        editgroup_id=entity.editgroup_id,
+        timestamp=entity.timestamp,
+        editor_id=editgroup.editor_id,
+    )
+
+    extra = editgroup.extra or dict()
+    if extra.get('agent'):
+        t['agent'] = extra['agent']
+
+    # Each count comes from the edit list of the same name; a missing
+    # (None) list is counted as zero edits.
+    edits = editgroup.edits
+    for entity_type in ('containers', 'creators', 'files', 'filesets',
+                        'webcaptures', 'releases', 'works'):
+        t[entity_type] = len(getattr(edits, entity_type) or [])
+
+    # TODO: parse and pull out counts
+    #created = 0
+    #updated = 0
+    #deleted = 0
+    #t['created'] = created
+    #t['updated'] = updated
+    #t['deleted'] = deleted
+    #t['total'] = created + updated + deleted
+    return t
author	Bryan Newbold <bnewbold@robocracy.org>	2019-03-11 16:38:51 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-03-11 16:38:51 -0700
commit	655f7060eb5b5e711a8a892cb1085639c4aa8fd2 (patch)
tree	ffa1139e0c56b6510ec71d1aa8cc426423449f11 /python/fatcat_tools/transforms/elasticsearch.py
parent	c937447f894cfde54628fecf3fa71127cb769f0c (diff)
download	fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.tar.gz fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.zip