first pass transform from pipelines to ES schema

author: Bryan Newbold <bnewbold@archive.org> 2020-05-20 13:27:55 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-05-20 13:27:55 -0700
commit: f2c465fffc76ca752249e11d32673db43efc35f1 (patch)
tree: e000389c916c56c322a984ebdf440a2c6129a0e0 /fatcat_scholar/transform.py
parent: 3ee18580dd108c69c01cdf838a7f1a7d3d181629 (diff)
download: fatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.tar.gz
fatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.zip
1 files changed, 306 insertions, 0 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
new file mode 100644
index 0000000..54d3f71
--- /dev/null
+++ b/fatcat_scholar/transform.py
@@ -0,0 +1,306 @@
+
+import os
+import io
+import sys
+import argparse
+from pydantic import BaseModel, validator
+from typing import List, Dict, Tuple, Optional, Any, Sequence
+from fatcat_openapi_client import ReleaseEntity, FileEntity
+import internetarchive
+
+from fatcat_scholar.api_entities import *
+from fatcat_scholar.djvu import djvu_extract_leaf_texts
+from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
+from fatcat_scholar.issue_db import IssueDB, SimIssueRow
+from fatcat_scholar.schema import *
+from fatcat_scholar.grobid2json import teixml2json
+
+
+def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]:
+    """
+    Builds the fulltext sub-document for a SIM (scanned microfilm) page range.
+
+    The body is the raw text of every entry in `sim['page_texts']`, joined
+    with newlines. Thumbnail and access URLs point at the archive.org issue
+    item; the access URL deep-links to the first page of the range.
+
+    Returns None when `sim['page_texts']` is empty.
+    """
+    pages = sim['page_texts']
+    if not pages:
+        return None
+    start_page = pages[0]['page_num']
+    item = sim['issue_item']
+    page_bodies = [page['raw_text'] for page in pages]
+    # acknowledgement, annex, and the file_* fields are intentionally left
+    # unset: there is no file entity behind a SIM page range
+    return ScholarFulltext(
+        lang_code=None, # TODO: pub/issue metadata? or langdetect?
+        body="\n".join(page_bodies),
+        release_ident=sim.get('release_ident'),
+        thumbnail_url=f"https://archive.org/serve/{item}/__ia_thumb.jpg",
+        access_url=f"https://archive.org/details/{item}/page/{start_page}",
+        access_type=AccessType.ia_sim,
+    )
+
+def es_sim_from_sim(sim: Dict[str, Any]) -> ScholarSim:
+    """
+    Builds the SIM sub-document (issue item, publication collection, SIM
+    publication id, first page) from an intermediate SIM fulltext dict.
+
+    `first_page` is None when `sim['page_texts']` is empty.
+    """
+    pages = sim['page_texts']
+    start_page = pages[0]['page_num'] if pages else None
+    return ScholarSim(
+        issue_item=sim['issue_item'],
+        pub_collection=sim['pub_item_metadata']['metadata']['identifier'],
+        sim_pubid=sim['issue_item_metadata']['metadata']['sim_pubid'],
+        first_page=start_page,
+    )
+
+# Lookup tables used by es_biblio_from_sim() to translate free-form values
+# from the SIM publication item metadata into the short codes stored in the
+# biblio sub-document. Lookups are done with .get(), so any value not listed
+# here maps to None.
+
+# publication 'pub_type' -> release_type
+SIM_RELEASE_TYPE_MAP = {
+    'Scholarly Journals': 'article-journal',
+    # TODO:
+}
+# publication 'language' -> lang_code
+SIM_LANG_MAP = {
+    'English': 'en',
+    # TODO:
+}
+# publication 'country' -> country_code
+SIM_COUNTRY_MAP = {
+    'Netherlands': 'nl',
+    # TODO:
+}
+
+def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
+    """
+    Builds the bibliographic sub-document for a SIM page document.
+
+    All values come from the archive.org item metadata embedded in `sim`:
+    issue-level metadata (`issue_item_metadata`) supplies volume, issue,
+    date, ISSN, and publisher; publication-level metadata
+    (`pub_item_metadata`) supplies the container title, type, language, and
+    country.
+
+    Missing optional fields (date, volume, issue, issn, ...) result in None
+    (or an empty list for `issns`) rather than an exception. The keys
+    `page_texts`, `issue_item_metadata`, `pub_item_metadata`, and the
+    publication `title` are required; a KeyError is raised if absent.
+    """
+
+    issue_meta = sim['issue_item_metadata']['metadata']
+    pub_meta = sim['pub_item_metadata']['metadata']
+
+    first_page = None
+    if sim['page_texts']:
+        first_page = sim['page_texts'][0]['page_num']
+
+    # If the final word of the title has the shape "NNNN-NNNN" (9 chars with
+    # a dash in the middle; presumably a year range), drop it along with the
+    # preceding separator character. An empty/whitespace-only title has no
+    # final word and is passed through unchanged.
+    container_name = pub_meta['title']
+    title_words = container_name.split()
+    if title_words:
+        last_word = title_words[-1]
+        if len(last_word) == 9 and last_word[4] == '-':
+            container_name = container_name[:-10]
+
+    # only accept ISSNs with the expected "NNNN-NNNN" length
+    issns = []
+    raw_issn = issue_meta.get('issn')
+    if raw_issn and len(raw_issn) == 9:
+        issns.append(raw_issn)
+
+    volume = issue_meta.get('volume')
+    volume_int = None
+    if volume and volume.isdigit():
+        volume_int = int(volume)
+    issue = issue_meta.get('issue')
+    issue_int = None
+    if issue and issue.isdigit():
+        issue_int = int(issue)
+
+    # 'date' is optional; both derived fields stay None when it is missing.
+    date = issue_meta.get('date')
+    release_year = None
+    release_date = None
+    if date:
+        # ">= 4" so that a bare four-digit year ("1995") is also accepted
+        if len(date) >= 4 and date[:4].isdigit():
+            release_year = int(date[:4])
+        # only a full-length date is passed through as release_date
+        if len(date) == len("2000-01-01"):
+            release_date = date
+
+    return ScholarBiblio(
+        #release_ident=release.ident,
+        title=None,
+        #subtitle=None,
+        #original_title=release.original_title,
+        release_date=release_date,
+        release_year=release_year,
+        release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get('pub_type')),
+        release_stage="published", # as a default
+        #withdrawn_status=release.withdrawn_status,
+        lang_code=SIM_LANG_MAP.get(pub_meta.get('language')),
+        country_code=SIM_COUNTRY_MAP.get(pub_meta.get('country')),
+        volume=volume,
+        volume_int=volume_int,
+        issue=issue,
+        issue_int=issue_int,
+        pages=sim.get('pages'),
+        first_page=first_page,
+        first_page_int=None,
+        #number=None,
+
+        # no external identifiers
+
+        #license_slug=release.license_slug,
+        publisher=issue_meta.get('publisher'),
+        container_name=container_name,
+        container_original_name=None, # TODO pass-through
+        container_ident=None, # TODO: pass-through
+        container_type=None, # TODO
+        container_issnl=None, # TODO: pass-through
+        issns=issns,
+
+        # no contrib/affiliation info
+        contrib_names=[],
+        affiliations=[],
+    )
+
+def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity) -> ScholarFulltext:
+    """
+    Fills in the release/file identifiers and the access URL on `fulltext`
+    (mutating it in place) and returns the same object.
+
+    URL selection walks `fe.urls` in order:
+
+    - the first URL on archive.org or web.archive.org wins immediately
+      (typed as `ia_file` or `wayback` respectively)
+    - otherwise the last URL in the list is used, typed as `repository` if
+      its `rel` is "repository", else `web`
+
+    If the file has no URLs (`fe.urls` is empty or None), both `access_url`
+    and `access_type` are set to None.
+    """
+    best_url = None
+    best_url_type = None
+    # fe.urls may be None on a file entity with no URLs set; treat as empty
+    for url in fe.urls or []:
+        best_url = url.url
+        best_url_type = AccessType.web
+        if '//archive.org/' in url.url:
+            best_url_type = AccessType.ia_file
+            break
+        elif '//web.archive.org/' in url.url:
+            best_url_type = AccessType.wayback
+            break
+        if url.rel == "repository":
+            best_url_type = AccessType.repository
+        # TODO: more file-to-access logic
+
+    fulltext.release_ident = re.ident
+    fulltext.file_ident = fe.ident
+    fulltext.file_sha1 = fe.sha1
+    fulltext.file_mimetype = fe.mimetype
+    fulltext.access_url = best_url
+    fulltext.access_type = best_url_type
+    return fulltext
+
+
+def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
+    """
+    Builds a fulltext sub-document from GROBID TEI-XML output.
+
+    The XML is converted with teixml2json(); language, body,
+    acknowledgement, and annex are copied over when present. Release and
+    file metadata (identifiers, access URL) are then attached from `re` and
+    `fe`.
+    """
+    tei = teixml2json(tei_xml)
+    fulltext = ScholarFulltext(
+        lang_code=tei.get('lang'),
+        body=tei.get('body'),
+        acknowledgement=tei.get('acknowledgement'),
+        annex=tei.get('annex'),
+        thumbnail_url=None, # TODO: sandcrawler thumbnails
+    )
+    fulltext = _add_file_release_meta(fulltext, re, fe)
+    return fulltext
+
+def es_fulltext_from_pdftotext(pdftotext: Any, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
+    """
+    Builds a fulltext sub-document from pdftotext output.
+
+    Only the raw text body is available from this source; the language code
+    is taken from the release entity instead. Release and file metadata
+    (identifiers, access URL) are then attached from `re` and `fe`.
+    """
+    fulltext = ScholarFulltext(
+        lang_code=re.language,
+        body=pdftotext['raw_text'],
+        acknowledgement=None,
+        annex=None,
+        thumbnail_url=None, # TODO: sandcrawler thumbnails
+    )
+    fulltext = _add_file_release_meta(fulltext, re, fe)
+    return fulltext
+
+def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
+    """
+    Transforms a "heavy" intermediate bundle into a search index document.
+
+    Two document types are handled:
+
+    - `sim_page`: biblio and fulltext come from the SIM fulltext dict
+    - `work`: biblio and abstracts come from the release whose ident equals
+      `heavy.biblio_release_ident`
+
+    Fulltext preference is GROBID first, then pdftotext as a fallback when
+    no other fulltext was produced (for `sim_page` documents the SIM
+    fulltext is used unless GROBID fulltext is present).
+
+    Raises NotImplementedError for any other doc_type, and IndexError if a
+    release or file referenced by the bundle's fulltext is not present in
+    `heavy.releases`.
+    """
+
+    tags: List[str] = []
+    work_ident: Optional[str] = None
+    abstracts: List[ScholarAbstract] = []
+    # Must be bound up front: the 'work' branch below never assigns it, and
+    # it is read unconditionally further down.
+    fulltext: Optional[ScholarFulltext] = None
+
+    ia_sim: Optional[ScholarSim] = None
+    if heavy.sim_fulltext is not None:
+        ia_sim = es_sim_from_sim(heavy.sim_fulltext)
+
+    if heavy.doc_type == DocType.sim_page:
+        assert ia_sim is not None
+        key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}"
+        biblio = es_biblio_from_sim(heavy.sim_fulltext)
+        fulltext = es_fulltext_from_sim(heavy.sim_fulltext)
+    elif heavy.doc_type == DocType.work:
+        work_ident = heavy.releases[0].work_id
+        key = f"work_{work_ident}"
+        assert heavy.biblio_release_ident
+        primary_release = [r for r in heavy.releases if r.ident == heavy.biblio_release_ident][0]
+        biblio = es_biblio_from_release(primary_release)
+
+        # TODO: abstracts from releases also? abstracts_dict?
+        abstracts = es_abstracts_from_release(primary_release)
+    else:
+        raise NotImplementedError(f"doc_type: {heavy.doc_type}")
+
+    if heavy.grobid_fulltext:
+
+        fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
+        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
+
+        fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)
+
+        # hack to pull through thumbnail from local pdftotext
+        if fulltext and not fulltext.thumbnail_url and heavy.pdftotext_fulltext:
+            fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/sha1/{fulltext_file.sha1}" # XXX
+
+    if not fulltext and heavy.pdftotext_fulltext:
+
+        # Look the release/file up via the pdftotext dict itself; the GROBID
+        # dict is usually absent on this path.
+        # NOTE(review): assumes pdftotext_fulltext carries 'release_ident'
+        # and 'file_ident' keys like grobid_fulltext does -- confirm against
+        # the code that builds the bundle.
+        fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0]
+        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0]
+        fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file)
+
+    # TODO: additional access list
+    # keyed by access type, so there is at most one entry per type
+    access_dict = dict()
+    if fulltext and fulltext.access_type:
+        access_dict[fulltext.access_type] = ScholarAccess(
+            access_type=fulltext.access_type,
+            access_url=fulltext.access_url,
+            mimetype=fulltext.file_mimetype,
+            file_ident=fulltext.file_ident,
+            release_ident=fulltext.release_ident,
+        )
+    if ia_sim and AccessType.ia_sim not in access_dict:
+        access_dict[AccessType.ia_sim] = ScholarAccess(
+            access_type=AccessType.ia_sim,
+            access_url=f"https://archive.org/details/{ia_sim.issue_item}/page/{ia_sim.first_page}",
+        )
+
+    # TODO: additional abstracts
+
+    return ScholarDoc(
+        key=key,
+        doc_type=heavy.doc_type.value,
+        doc_index_ts=datetime.datetime.utcnow(),
+        work_ident=work_ident,
+        tags=tags,
+
+        biblio=biblio,
+        fulltext=fulltext,
+        ia_sim=ia_sim,
+        abstracts=abstracts,
+        releases=[es_release_from_release(r) for r in heavy.releases],
+        access=list(access_dict.values()),
+    )
+
+def run_transform(infile):
+    """
+    Reads intermediate bundles as JSON lines from `infile`, transforms each
+    one with transform_heavy(), and prints the resulting document as a JSON
+    line on stdout. Bundles for which no document is produced are skipped.
+    """
+    for raw_line in infile:
+        record = json.loads(raw_line)
+
+        doc_type = DocType(record['doc_type'])
+        # round-trip each release through JSON text, since entity_from_json()
+        # is handed a string here rather than a dict
+        releases = [
+            entity_from_json(json.dumps(release_obj), ReleaseEntity)
+            for release_obj in record['releases']
+        ]
+        bundle = IntermediateBundle(
+            doc_type=doc_type,
+            releases=releases,
+            biblio_release_ident=record.get('biblio_release_ident'),
+            grobid_fulltext=record.get('grobid_fulltext'),
+            pdftotext_fulltext=record.get('pdftotext_fulltext'),
+            sim_fulltext=record.get('sim_fulltext'),
+        )
+        es_doc = transform_heavy(bundle)
+        if es_doc:
+            print(es_doc.json())
+
+def main():
+    """
+    Run this command like:
+
+        python -m fatcat_scholar.transform
+    """
+
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    subparsers = parser.add_subparsers()
+
+    transform_cmd = subparsers.add_parser('run_transform',
+        help="iterates through 'heavy' intermediate")
+    transform_cmd.set_defaults(func='run_transform')
+    transform_cmd.add_argument("json_file",
+        help="intermediate globs as JSON-lines",
+        nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+
+    args = parser.parse_args()
+
+    # 'func' is only set when a sub-command was given on the command line
+    command = vars(args).get("func")
+    if not command:
+        print("tell me what to do! (try --help)")
+        sys.exit(-1)
+
+    if command != 'run_transform':
+        raise NotImplementedError(command)
+    run_transform(infile=args.json_file)
+
+# run the command-line interface only when executed as a script/module,
+# not when this file is imported
+if __name__=="__main__":
+    main()
author	Bryan Newbold <bnewbold@archive.org>	2020-05-20 13:27:55 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-05-20 13:27:55 -0700
commit	f2c465fffc76ca752249e11d32673db43efc35f1 (patch)
tree	e000389c916c56c322a984ebdf440a2c6129a0e0 /fatcat_scholar/transform.py
parent	3ee18580dd108c69c01cdf838a7f1a7d3d181629 (diff)
download	fatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.tar.gz fatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.zip