From f2c465fffc76ca752249e11d32673db43efc35f1 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Wed, 20 May 2020 13:27:55 -0700
Subject: first pass transform from pipelines to ES schema

---
 fatcat_scholar/es_transform.py  | 316 -------------------------------------
 fatcat_scholar/grobid2json.py   | 201 ++++++++++++++++++++++++
 fatcat_scholar/schema.py        | 334 ++++++++++++++++++++++++++++++++++++++++
 fatcat_scholar/sim_pipeline.py  |  12 +-
 fatcat_scholar/transform.py     | 306 ++++++++++++++++++++++++++++++++++++
 fatcat_scholar/work_pipeline.py |  17 +-
 6 files changed, 850 insertions(+), 336 deletions(-)
 delete mode 100644 fatcat_scholar/es_transform.py
 create mode 100755 fatcat_scholar/grobid2json.py
 create mode 100644 fatcat_scholar/schema.py
 create mode 100644 fatcat_scholar/transform.py

(limited to 'fatcat_scholar')

diff --git a/fatcat_scholar/es_transform.py b/fatcat_scholar/es_transform.py
deleted file mode 100644
index 464b082..0000000
--- a/fatcat_scholar/es_transform.py
+++ /dev/null
@@ -1,316 +0,0 @@
-
-"""
-Originally wrote these as dataclasses using pydantic.dataclasses, but we don't
-get serialization for free with those. This is useful for things like
-auto-conversion of datetime objects.
-"""
-
-import ftfy
-import typing
-import datetime
-from enum import Enum
-from typing import Optional, List
-from xml.etree import cElementTree as ET
-from pydantic import BaseModel
-from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
-
-
-class DocType(str, Enum):
-    work = "work"
-    sim_page = "sim_page"
-
-class AccessType(str, Enum):
-    ia_sim = "ia_sim"
-    ia_file = "ia_file"
-    wayback = "wayback"
-    repository = "repository"
-    paywall = "paywall"
-    loginwall = "loginwall"
-    shadow = "shadow"
-
-class ScholarBiblio(BaseModel):
-    release_ident: Optional[str]
-    title: str
-    subtitle: Optional[str]
-    original_title: Optional[str]
-    release_date: Optional[datetime.date]
-    release_year: Optional[int]
-    release_type: Optional[str]
-    release_stage: Optional[str]
-    withdrawn_status: Optional[str]
-    lang_code: Optional[str]
-    country_code: Optional[str]
-    volume: Optional[str]
-    volume_int: Optional[str]   # TODO: needed?
-    issue: Optional[str]
-    issue_int: Optional[str]    # TODO: needed?
-    pages: Optional[str]
-    first_page: Optional[str]
-    first_page_int: Optional[str] # TODO: needed?
-    number: Optional[str]
-
-    doi: Optional[str]
-    doi_prefix: Optional[str]
-    doi_registrar: Optional[str]
-    pmid: Optional[str]
-    pmcid: Optional[str]
-    isbn13: Optional[str]
-    wikidata_qid: Optional[str]
-    arxiv_id: Optional[str]
-    jstor_id: Optional[str]
-    mag_id: Optional[str]
-
-    license_slug: Optional[str]
-    publisher: Optional[str]
-    publisher_type: Optional[str]
-    container_name: Optional[str]
-    container_original_name: Optional[str]
-    container_ident: Optional[str]
-    container_issnl: Optional[str]
-    container_wikidata_qid: Optional[str]
-    issns: List[str]
-    container_type: Optional[str]
-    contrib_count: Optional[int]
-    contrib_names: List[str]
-    affiliations: List[str]
-
-class ScholarFulltext(BaseModel):
-    lang_code: Optional[str]
-    body: str
-    acknowledgement: Optional[str]
-    annex: Optional[str]
-    release_ident: Optional[str]
-    file_ident: Optional[str]
-    file_sha1: Optional[str]
-    file_mimetype: Optional[str]
-    thumbnail_url: Optional[str]
-    access_url: Optional[str]
-    access_type: Optional[AccessType]
-
-class ScholarRelease(BaseModel):
-    ident: Optional[str]
-    revision: Optional[str]
-    title: str
-    release_date: Optional[datetime.date]
-    release_year: Optional[int]
-    release_type: Optional[str]
-    release_stage: Optional[str]
-    withdrawn_status: Optional[str]
-
-    doi: Optional[str]
-    doi_prefix: Optional[str]
-    doi_registrar: Optional[str]
-    pmid: Optional[str]
-    pmcid: Optional[str]
-    isbn13: Optional[str]
-    wikidata_qid: Optional[str]
-    arxiv_id: Optional[str]
-    jstor_id: Optional[str]
-    mag_id: Optional[str]
-
-    license_slug: Optional[str]
-    container_name: Optional[str]
-    container_ident: Optional[str]
-    container_issnl: Optional[str]
-    container_type: Optional[str]
-
-class ScholarSim(BaseModel):
-    issue_item: str
-    pub_collection: str
-    sim_pubid: str
-    first_page: Optional[str]
-
-class ScholarAbstract(BaseModel):
-    body: str
-    lang_code: Optional[str]
-
-class ScholarAccess(BaseModel):
-    access_type: AccessType
-    access_url: str
-    mimetype: str
-    file_ident: Optional[str]
-    release_ident: Optional[str]
-
-class ScholarDoc(BaseModel):
-    key: str
-    doc_type: str # enum: work or page
-    doc_index_ts: datetime.datetime
-    work_ident: Optional[str]
-    tags: List[str] = []
-
-    biblio: ScholarBiblio
-    fulltext: ScholarFulltext
-    ia_sim: ScholarSim
-    abstracts: List[ScholarAbstract]
-    releases: List[ScholarRelease]
-    access: List[ScholarAccess]
-
-def doi_split_prefix(doi: str) -> str:
-    return doi.split('/')[0]
-
-def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
-    if not release.ext_ids.doi or not release.extra:
-        return None
-    for registrar in ('crossref', 'datacite', 'jalc'):
-        if registrar in release.extra:
-            return registrar
-    # TODO: should we default to Crossref?
-    return None
-
-def scrub_text(raw: str, mimetype: str = None) -> str:
-    """
-    This function takes a mimetype-hinted string and tries to reduce it to a
-    simple token-and-punctuation scheme with any and all markup removed. Eg,
-    HTML tags, JATS XML tags, LaTeX, whatever.
-
-    The output should be clean and "HTML safe" (though should still be escaped
-    in HTML to get entity encoding correct).
-
-    TODO: barely implemented yet
-    """
-    if "<jats" in raw or (mimetype and "application/xml" in mimetype):
-        root = ET.fromstring(raw)
-        raw = " ".join(list(root.itertext())) or ""
-    raw = ftfy.fix_text(raw)
-    assert raw, "Empty abstract"
-    return raw
-
-def contrib_name(contrib: ReleaseContrib) -> str:
-    # TODO: support more cultural normals for name presentation
-    if contrib.given_name and contrib.family_name:
-        return f"{contrib.given_name} {contrib.family_name}"
-    elif contrib.raw_name:
-        return contrib.raw_name
-    elif contrib.family_name:
-        return contrib.family_name
-    else:
-        return contrib.given_name
-
-def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
-    return None
-
-def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
-
-    d = dict()
-    for abst in release.abstracts:
-        if not abst.lang in d:
-            d[abst.lang] = ScholarAbstract(lang_code=abst.lang, body=scrub_text(abst.content))
-    return list(d.values())
-
-def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
-
-    if release.container:
-        publisher = release.publisher
-        container_name = release.container.name
-        container_original_name = release.container.extra and release.container.extra.get('original_name')
-        container_ident = release.container.ident
-        container_type = release.container.container_type
-        container_issnl = release.container.issnl
-        issns = [container_issnl,]
-        if release.extra.get('issne'):
-            issns.append(release.extra['issne'])
-        if release.extra.get('issnp'):
-            issns.append(release.extra['issnp'])
-        issns = list(set(issns))
-    else:
-        publisher = release.extra.get('publisher')
-        container_name = release.extra.get('container_name')
-        container_original_name = None
-        container_ident = None
-        container_type = None
-        container_issnl = None
-        issns = []
-
-    first_page: Optional[str] = None
-    if release.pages:
-        first_page = release.pages.split('-')[0]
-    first_page_int: Optional[int] = None
-    if first_page and first_page.isdigit():
-        first_page_int = int(first_page)
-
-    ret = ScholarBiblio(
-        release_ident=release.ident,
-        title=release.title,
-        subtitle=release.subtitle,
-        original_title=release.original_title,
-        release_date=release.release_date,
-        release_year=release.release_year,
-        release_type=release.release_type,
-        release_stage=release.release_stage,
-        withdrawn_status=release.withdrawn_status,
-        lang_code=release.language,
-        country_code=release.extra and release.extra.get('country'),
-        volume=release.volume,
-        volume_int=None,
-        issue=release.issue,
-        issue_int=None,
-        pages=release.pages,
-        first_page=first_page,
-        first_page_int=None,
-        number=release.number,
-
-        doi=release.ext_ids.doi,
-        doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi),
-        doi_registrar=release_doi_registrar(release),
-        pmid=release.ext_ids.pmid,
-        pmcid=release.ext_ids.pmcid,
-        isbn13=release.ext_ids.isbn13,
-        wikidata_qid=release.ext_ids.wikidata_qid,
-        arxiv_id=release.ext_ids.arxiv,
-        jstor_id=release.ext_ids.jstor,
-        mag_id=release.ext_ids.mag,
-
-        license_slug=release.license_slug,
-        publisher=publisher,
-        container_name=container_name,
-        container_original_name=container_original_name,
-        container_ident=container_ident,
-        container_type=container_type,
-        container_issnl=container_issnl,
-        issns=issns,
-
-        contrib_names=[contrib_name(c) for c in release.contribs if c.index],
-        contrib_count = len([c for c in release.contribs if c.index]),
-        affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])),
-    )
-    return ret
-
-def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
-
-    if release.container:
-        container_name = release.container.name
-        container_ident = release.container.ident
-        container_issnl = release.container.issnl
-        container_type = release.container.container_type
-    else:
-        container_name = release.extra.get('container_name')
-        container_ident = None
-        container_issnl = None
-        container_type = None
-
-    ret = ScholarRelease(
-        ident=release.ident,
-        revision=release.revision,
-        title=release.title,
-        release_date=release.release_date,
-        release_year=release.release_year,
-        release_type=release.release_type,
-        release_stage=release.release_stage,
-        withdrawn_status=release.withdrawn_status,
-        doi=release.ext_ids.doi,
-        doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi),
-        doi_registrar=release_doi_registrar(release),
-        pmid=release.ext_ids.pmid,
-        pmcid=release.ext_ids.pmcid,
-        isbn13=release.ext_ids.isbn13,
-        wikidata_qid=release.ext_ids.wikidata_qid,
-        arxiv_id=release.ext_ids.arxiv,
-        jstor_id=release.ext_ids.jstor,
-        mag_id=release.ext_ids.mag,
-        license_slug=release.license_slug,
-        container_name=container_name,
-        container_ident=container_ident,
-        container_issnl=container_issnl,
-        container_type=container_type,
-    )
-    return ret
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
new file mode 100755
index 0000000..9c2ffad
--- /dev/null
+++ b/fatcat_scholar/grobid2json.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+
+"""
+NB: adapted to work as a library for PDF extraction. Will probably be
+re-written eventually to be correct, complete, and robust; this is just a
+first iteration.
+
+This script tries to extract everything from a GROBID TEI XML fulltext dump:
+
+- header metadata
+- affiliations
+- references (with context)
+- abstract
+- fulltext
+- tables, figures, equations
+
+A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
+
+- abstract
+- fulltext
+- tables, figures, equations
+
+Prints JSON to stdout, errors to stderr
+
+This file copied from the sandcrawler repository.
+"""
+
+import io
+import json
+import argparse
+import xml.etree.ElementTree as ET
+
+xml_ns = "http://www.w3.org/XML/1998/namespace"
+ns = "http://www.tei-c.org/ns/1.0"
+
+def all_authors(elem):
+    """
+    Returns a list of author dicts ('name', plus optional 'given_name',
+    'surname', 'affiliation') for every TEI <author> found under `elem`.
+    """
+    if elem is None:
+        return []
+    names = []
+    for author in elem.findall('.//{%s}author' % ns):
+        pn = author.find('./{%s}persName' % ns)
+        # explicit None/len checks: truth-testing an Element is deprecated
+        if pn is None or len(pn) == 0:
+            continue
+        obj = dict(name=' '.join(pn.itertext()))
+        given_name = pn.findtext('./{%s}forename' % ns)
+        if given_name:
+            obj['given_name'] = given_name
+        surname = pn.findtext('./{%s}surname' % ns)
+        if surname:
+            obj['surname'] = surname
+        ae = author.find('./{%s}affiliation' % ns)
+        if ae is not None and len(ae) > 0:
+            affiliation = dict()
+            for on in ae.findall('./{%s}orgName' % ns):
+                affiliation[on.get('type')] = on.text
+            addr_e = ae.find('./{%s}address' % ns)
+            if addr_e is not None:
+                # iterate children directly; getchildren() was removed in py3.9
+                address = {t.tag.split('}')[-1]: t.text for t in addr_e}
+                if address:
+                    affiliation['address'] = address
+            obj['affiliation'] = affiliation
+        names.append(obj)
+    return names
+
+
+def journal_info(elem):
+    """
+    Pulls journal-level metadata (name, publisher, ISSNs, volume, issue) out
+    of a TEI element; keys with empty or missing values are left out.
+    """
+    # (output key, ElementTree path) pairs, in output order
+    xpaths = (
+        ('name', './/{%s}monogr/{%s}title' % (ns, ns)),
+        ('publisher', './/{%s}publicationStmt/{%s}publisher' % (ns, ns)),
+        ('issn', './/{%s}idno[@type="ISSN"]' % ns),
+        ('eissn', './/{%s}idno[@type="eISSN"]' % ns),
+        ('volume', './/{%s}biblScope[@unit="volume"]' % ns),
+        ('issue', './/{%s}biblScope[@unit="issue"]' % ns),
+    )
+    found = ((key, elem.findtext(xpath)) for (key, xpath) in xpaths)
+    # drop empty/null values
+    return {key: val for (key, val) in found if val}
+
+
+def biblio_info(elem):
+    """
+    Converts one TEI <biblStruct> reference element into a flat dict with
+    'id', 'title', 'authors', 'publisher', 'date', 'volume', 'issue' and
+    'url' keys (plus 'journal' when a monograph-level title is present).
+    The 'date' and 'url' values are None when the element is not found.
+    """
+    ref = dict()
+    ref['id'] = elem.attrib.get('{%s}id' % xml_ns)  # xml:id
+    # Title stuff is messy in references...
+    ref['title'] = elem.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
+    other_title = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
+    if other_title:
+        # with no analytic title, the monogr title is used as the title itself
+        ref['journal'] = other_title if ref['title'] else None
+        ref['title'] = ref['title'] or other_title
+    ref['authors'] = all_authors(elem)
+    ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns)) or None
+    date = elem.find('.//{%s}date[@type="published"]' % ns)
+    # None (not False) when there is no published-date element
+    ref['date'] = date.attrib.get('when') if date is not None else None
+    ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+    ref['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+    el = elem.find('.//{%s}ptr[@target]' % ns)
+    ref['url'] = el.attrib['target'] if el is not None else None
+    # Hand correction
+    if ref['url'] and ref['url'].endswith(".Lastaccessed"):
+        ref['url'] = ref['url'].replace(".Lastaccessed", "")
+    return ref
+
+
+def _element_text(el):
+    # joined text of an element, or None if absent (explicit None test: childless Elements are falsy)
+    if el is None:
+        return None
+    return " ".join(el.itertext()).strip()
+
+
+def teixml2json(content, encumbered=True):
+    """
+    Parses a GROBID TEI-XML document into a dict of extracted metadata.
+
+    content: str, bytes, or an open file object containing the XML
+    encumbered: when False, skip abstract/body/acknowledgement/annex text
+    Raises ValueError if there is no <teiHeader>. Empty/null keys are dropped.
+    """
+    if isinstance(content, str):
+        content = io.StringIO(content)
+    elif isinstance(content, bytes):
+        content = io.BytesIO(content)
+
+    tei = ET.parse(content).getroot()
+    header = tei.find('.//{%s}teiHeader' % ns)
+    if header is None:
+        raise ValueError("XML does not look like TEI format")
+    info = dict()
+    application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0]
+    info['grobid_version'] = application_tag.attrib['version'].strip()
+    info['grobid_timestamp'] = application_tag.attrib['when'].strip()
+    info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
+    bibl = header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns))
+    info['authors'] = all_authors(bibl) if bibl is not None else []
+    info['journal'] = journal_info(header)
+    date = header.find('.//{%s}date[@type="published"]' % ns)
+    info['date'] = date.attrib.get('when') if date is not None else None
+    info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
+    info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
+    if info['doi']:
+        info['doi'] = info['doi'].lower()
+
+    refs = []
+    for (i, bs) in enumerate(tei.findall('.//{%s}listBibl/{%s}biblStruct' % (ns, ns))):
+        ref = biblio_info(bs)
+        ref['index'] = i
+        refs.append(ref)
+    info['citations'] = refs
+
+    text = tei.find('.//{%s}text' % (ns))
+    if text is not None and text.attrib.get('{%s}lang' % xml_ns):
+        info['language_code'] = text.attrib['{%s}lang' % xml_ns]  # xml:lang
+
+    if encumbered:
+        info['abstract'] = _element_text(tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns)))
+        info['body'] = _element_text(tei.find('.//{%s}text/{%s}body' % (ns, ns)))
+        info['acknowledgement'] = _element_text(tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns)))
+        info['annex'] = _element_text(tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns)))
+
+    # remove empty/null keys
+    return {k: v for (k, v) in info.items() if v}
+
+def main():   # pragma no cover
+    """Command-line entry point: prints one JSON line per TEI-XML file given."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="GROBID TEI XML to JSON",
+        usage="%(prog)s [options] <teifile>...")
+    parser.add_argument("--no-encumbered",
+        action="store_true",
+        help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
+    parser.add_argument("teifiles", nargs='+')
+
+    args = parser.parse_args()
+
+    for filename in args.teifiles:
+        # context manager so each input file is closed once it has been parsed
+        with open(filename, 'r') as content:
+            info = teixml2json(content, encumbered=(not args.no_encumbered))
+        print(json.dumps(info, sort_keys=True))
+
+if __name__=='__main__':   # pragma no cover
+    main()
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
new file mode 100644
index 0000000..aa4ed52
--- /dev/null
+++ b/fatcat_scholar/schema.py
@@ -0,0 +1,334 @@
+
+"""
+Originally wrote these as dataclasses using pydantic.dataclasses, but we don't
+get serialization for free with those. This is useful for things like
+auto-conversion of datetime objects.
+"""
+
+import ftfy
+import datetime
+from enum import Enum
+from typing import Optional, List, Any
+from xml.etree import cElementTree as ET
+from pydantic import BaseModel
+
+from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
+from fatcat_scholar.api_entities import entity_to_dict
+
+
+class DocType(str, Enum):  # document kind; the type of IntermediateBundle.doc_type
+    work = "work"
+    sim_page = "sim_page"
+
+class IntermediateBundle(BaseModel):  # releases plus extracted fulltext, as assembled by the pipelines
+    doc_type: DocType
+    releases: List[ReleaseEntity]
+    biblio_release_ident: Optional[str]
+    grobid_fulltext: Optional[Any]
+    pdftotext_fulltext: Optional[Any]
+    sim_fulltext: Optional[Any]  # sim_pipeline.py passes a dict here
+
+    class Config:
+        arbitrary_types_allowed = True  # presumably needed because ReleaseEntity is not a pydantic type -- confirm
+        json_encoders = {
+            ReleaseEntity: lambda re: entity_to_dict(re),  # serialize releases via entity_to_dict()
+        }
+
+
+class AccessType(str, Enum):  # values for the access_type fields of ScholarFulltext and ScholarAccess
+    ia_sim = "ia_sim"
+    ia_file = "ia_file"
+    wayback = "wayback"
+    web = "web"
+    repository = "repository"
+    paywall = "paywall"
+    loginwall = "loginwall"
+    shadow = "shadow"
+
+class ScholarBiblio(BaseModel):  # bibliographic metadata; built by es_biblio_from_release()
+    release_ident: Optional[str]
+    title: Optional[str]
+    subtitle: Optional[str]
+    original_title: Optional[str]
+    release_date: Optional[datetime.date]
+    release_year: Optional[int]
+    release_type: Optional[str]
+    release_stage: Optional[str]
+    withdrawn_status: Optional[str]
+    lang_code: Optional[str]
+    country_code: Optional[str]
+    volume: Optional[str]
+    volume_int: Optional[str]   # TODO: needed? NOTE(review): typed str despite the _int name
+    issue: Optional[str]
+    issue_int: Optional[str]    # TODO: needed? NOTE(review): typed str despite the _int name
+    pages: Optional[str]
+    first_page: Optional[str]  # text before the first '-' of `pages`
+    first_page_int: Optional[str] # TODO: needed? NOTE(review): typed str despite the _int name
+    number: Optional[str]
+
+    doi: Optional[str]
+    doi_prefix: Optional[str]  # part of the DOI before the first '/'
+    doi_registrar: Optional[str]  # 'crossref', 'datacite' or 'jalc' (see release_doi_registrar)
+    pmid: Optional[str]
+    pmcid: Optional[str]
+    isbn13: Optional[str]
+    wikidata_qid: Optional[str]
+    arxiv_id: Optional[str]
+    jstor_id: Optional[str]
+    mag_id: Optional[str]
+
+    license_slug: Optional[str]
+    publisher: Optional[str]
+    publisher_type: Optional[str]
+    container_name: Optional[str]
+    container_original_name: Optional[str]
+    container_ident: Optional[str]
+    container_issnl: Optional[str]
+    container_wikidata_qid: Optional[str]
+    issns: List[str]  # required (no default)
+    container_type: Optional[str]
+    contrib_count: Optional[int]
+    contrib_names: List[str]  # required (no default)
+    affiliations: List[str]  # required (no default)
+
+class ScholarFulltext(BaseModel):  # extracted fulltext plus identifiers/URLs of the copy it came from
+    lang_code: Optional[str]
+    body: str  # the only required field
+    acknowledgement: Optional[str]
+    annex: Optional[str]
+    release_ident: Optional[str]
+    file_ident: Optional[str]
+    file_sha1: Optional[str]
+    file_mimetype: Optional[str]
+    thumbnail_url: Optional[str]
+    access_url: Optional[str]
+    access_type: Optional[AccessType]
+
+class ScholarRelease(BaseModel):  # per-release summary; built by es_release_from_release()
+    ident: Optional[str]
+    revision: Optional[str]
+    title: str  # required here, unlike ScholarBiblio.title
+    release_date: Optional[datetime.date]
+    release_year: Optional[int]
+    release_type: Optional[str]
+    release_stage: Optional[str]
+    withdrawn_status: Optional[str]
+
+    doi: Optional[str]
+    doi_prefix: Optional[str]  # part of the DOI before the first '/'
+    doi_registrar: Optional[str]  # 'crossref', 'datacite' or 'jalc' (see release_doi_registrar)
+    pmid: Optional[str]
+    pmcid: Optional[str]
+    isbn13: Optional[str]
+    wikidata_qid: Optional[str]
+    arxiv_id: Optional[str]
+    jstor_id: Optional[str]
+    mag_id: Optional[str]
+
+    license_slug: Optional[str]
+    container_name: Optional[str]
+    container_ident: Optional[str]
+    container_issnl: Optional[str]
+    container_type: Optional[str]
+
+class ScholarSim(BaseModel):  # the type of ScholarDoc.ia_sim
+    issue_item: str  # presumably an archive.org item identifier -- confirm
+    pub_collection: str
+    sim_pubid: str
+    first_page: Optional[str]
+
+class ScholarAbstract(BaseModel):  # one abstract; es_abstracts_from_release() builds one per language
+    body: str  # abstract text after scrub_text()
+    lang_code: Optional[str]
+
+class ScholarAccess(BaseModel):  # one access option; the element type of ScholarDoc.access
+    access_type: AccessType
+    access_url: str
+    mimetype: Optional[str]
+    file_ident: Optional[str]
+    release_ident: Optional[str]
+
+class ScholarDoc(BaseModel):  # top-level document combining the Scholar* sub-models below
+    key: str
+    doc_type: str # enum: work or page (NOTE(review): DocType spells the second value "sim_page")
+    doc_index_ts: datetime.datetime
+    work_ident: Optional[str]
+    tags: List[str] = []
+
+    biblio: ScholarBiblio
+    fulltext: Optional[ScholarFulltext]
+    ia_sim: Optional[ScholarSim]
+    abstracts: List[ScholarAbstract]
+    releases: List[ScholarRelease]
+    access: List[ScholarAccess]
+
+def doi_split_prefix(doi: str) -> str:
+    return doi.partition('/')[0]  # everything before the first '/'
+
+def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
+    """
+    Returns the first of 'crossref', 'datacite', 'jalc' that is present in
+    release.extra, or None if there is no DOI, no extra, or no match.
+    """
+    if release.ext_ids.doi and release.extra:
+        return next((r for r in ('crossref', 'datacite', 'jalc') if r in release.extra), None)
+    return None  # TODO: should we default to Crossref?
+
+def scrub_text(raw: str, mimetype: str = None) -> str:
+    """
+    Reduces a (mimetype-hinted) string to plain text: input containing "<jats"
+    or hinted as "application/xml" is parsed as XML and only its text nodes
+    are kept (space-joined); the result is then passed through ftfy.fix_text.
+
+    The output should be clean and "HTML safe" (though should still be escaped
+    in HTML to get entity encoding correct). Asserts the result is non-empty.
+
+    TODO: barely implemented yet
+    """
+    looks_like_xml = "<jats" in raw or (mimetype and "application/xml" in mimetype)
+    if looks_like_xml:
+        raw = " ".join(ET.fromstring(raw).itertext()) or ""
+    cleaned = ftfy.fix_text(raw)
+    assert cleaned, "Empty abstract"
+    return cleaned
+
+def contrib_name(contrib: ReleaseContrib) -> str:
+    """
+    Picks a display name: "given family" if both are set, else raw_name,
+    else family_name, else given_name (whatever value that holds).
+    """
+    # TODO: support more cultural normals for name presentation
+    if contrib.given_name and contrib.family_name:
+        return f"{contrib.given_name} {contrib.family_name}"
+    # otherwise fall back through the remaining name fields in priority order
+    return contrib.raw_name or contrib.family_name or contrib.given_name
+
+def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
+    # TODO: not implemented yet; always returns None, whatever `contrib` holds
+    return None
+
+def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
+    """Builds one ScholarAbstract per language, keeping the first abstract seen for each."""
+    by_lang = dict()
+    for abst in release.abstracts:
+        if abst.lang not in by_lang:
+            by_lang[abst.lang] = ScholarAbstract(lang_code=abst.lang, body=scrub_text(abst.content))
+    return list(by_lang.values())
+
+def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
+    """Builds the ScholarBiblio (bibliographic metadata) record for one release."""
+    # release.extra may be unset; normalize so the .get() lookups below are safe
+    extra = release.extra or {}
+
+    if release.container:
+        publisher = release.publisher
+        container_name = release.container.name
+        container_original_name = (release.container.extra or {}).get('original_name')
+        container_ident = release.container.ident
+        container_type = release.container.container_type
+        container_issnl = release.container.issnl
+        # skip missing values (issns is List[str]) and de-duplicate, keeping order
+        candidates = [container_issnl, extra.get('issne'), extra.get('issnp')]
+        issns = list(dict.fromkeys(issn for issn in candidates if issn))
+    else:  # no container entity: fall back to the free-form values in extra
+        publisher = extra.get('publisher')
+        container_name = extra.get('container_name')
+        container_original_name = None
+        container_ident = None
+        container_type = None
+        container_issnl = None
+        issns = []
+
+    first_page: Optional[str] = None
+    if release.pages:
+        first_page = release.pages.split('-')[0]
+
+    # an index of 0 is falsy, so compare against None to keep that contributor
+    indexed_contribs = [c for c in release.contribs if c.index is not None]
+
+    ret = ScholarBiblio(
+        release_ident=release.ident,
+        title=release.title,
+        subtitle=release.subtitle,
+        original_title=release.original_title,
+        release_date=release.release_date,
+        release_year=release.release_year,
+        release_type=release.release_type,
+        release_stage=release.release_stage,
+        withdrawn_status=release.withdrawn_status,
+        lang_code=release.language,
+        country_code=extra.get('country'),
+        volume=release.volume,
+        volume_int=None,
+        issue=release.issue,
+        issue_int=None,
+        pages=release.pages,
+        first_page=first_page,
+        first_page_int=None,  # TODO: not populated yet (see schema field)
+        number=release.number,
+
+        doi=release.ext_ids.doi,
+        doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi),
+        doi_registrar=release_doi_registrar(release),
+        pmid=release.ext_ids.pmid,
+        pmcid=release.ext_ids.pmcid,
+        isbn13=release.ext_ids.isbn13,
+        wikidata_qid=release.ext_ids.wikidata_qid,
+        arxiv_id=release.ext_ids.arxiv,
+        jstor_id=release.ext_ids.jstor,
+        mag_id=release.ext_ids.mag,
+
+        license_slug=release.license_slug,
+        publisher=publisher,
+        container_name=container_name,
+        container_original_name=container_original_name,
+        container_ident=container_ident,
+        container_type=container_type,
+        container_issnl=container_issnl,
+        issns=issns,
+
+        contrib_names=[contrib_name(c) for c in indexed_contribs],
+        contrib_count=len(indexed_contribs),
+        affiliations=[a for a in map(contrib_affiliation, indexed_contribs) if a],
+    )
+    return ret
+
+def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
+    """Builds the ScholarRelease summary record (identifiers plus container info) for one release."""
+    if release.container:
+        container_name = release.container.name
+        container_ident = release.container.ident
+        container_issnl = release.container.issnl
+        container_type = release.container.container_type
+    else:
+        container_name = (release.extra or {}).get('container_name')  # extra may be unset
+        container_ident = None
+        container_issnl = None
+        container_type = None
+
+    ret = ScholarRelease(
+        ident=release.ident,
+        revision=release.revision,
+        title=release.title,
+        release_date=release.release_date,
+        release_year=release.release_year,
+        release_type=release.release_type,
+        release_stage=release.release_stage,
+        withdrawn_status=release.withdrawn_status,
+        doi=release.ext_ids.doi,
+        doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi),
+        doi_registrar=release_doi_registrar(release),
+        pmid=release.ext_ids.pmid,
+        pmcid=release.ext_ids.pmcid,
+        isbn13=release.ext_ids.isbn13,
+        wikidata_qid=release.ext_ids.wikidata_qid,
+        arxiv_id=release.ext_ids.arxiv,
+        jstor_id=release.ext_ids.jstor,
+        mag_id=release.ext_ids.mag,
+        license_slug=release.license_slug,
+        container_name=container_name,
+        container_ident=container_ident,
+        container_issnl=container_issnl,
+        container_type=container_type,
+    )
+    return ret
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index 1dd6476..4315e70 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -13,8 +13,7 @@ from fatcat_scholar.api_entities import *
 from fatcat_scholar.djvu import djvu_extract_leaf_texts
 from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
 from fatcat_scholar.issue_db import IssueDB, SimIssueRow
-from fatcat_scholar.es_transform import es_biblio_from_release, es_release_from_release, DocType
-from fatcat_scholar.work_pipeline import IntermediateBundle
+from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
 
 
 def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]:
@@ -52,8 +51,9 @@ class SimPipeline():
         issue_item 
         pages: str
         page_texts: list
-            page_number
             raw_text
+            page_num
+            leaf_num
         release_ident: Optional[str]
         pub_item_metadata
         issue_item_metadata
@@ -107,6 +107,10 @@ class SimPipeline():
         self.issue_db.db.row_factory = sqlite3.Row
         cur = self.issue_db.db.cursor()
         for row in cur.execute('SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3'):
+            # filter out "contents" and "index" items
+            # TODO: more filters; also redundant with IssueDB code?
+            if row['issue_item'].endswith('_contents') or row['issue_item'].endswith('_index'):
+                continue
             full_issue = self.fetch_sim_issue(row)
             if not full_issue:
                 continue
@@ -120,7 +124,7 @@ class SimPipeline():
                     sim_fulltext=dict(
                         issue_item=full_issue['issue_item'],
                         pages=str(leaf['page_num']),
-                        page_texts=[leaf['raw_text']],
+                        page_texts=[leaf],
                         release_ident=None,
                         pub_item_metadata=full_issue['pub_item_metadata'],
                         issue_item_metadata=full_issue['issue_item_metadata'],
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
new file mode 100644
index 0000000..54d3f71
--- /dev/null
+++ b/fatcat_scholar/transform.py
@@ -0,0 +1,306 @@
+
+import os
+import io
+import sys
+import argparse
+from pydantic import BaseModel, validator
+from typing import List, Dict, Tuple, Optional, Any, Sequence
+from fatcat_openapi_client import ReleaseEntity, FileEntity
+import internetarchive
+
+from fatcat_scholar.api_entities import *
+from fatcat_scholar.djvu import djvu_extract_leaf_texts
+from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
+from fatcat_scholar.issue_db import IssueDB, SimIssueRow
+from fatcat_scholar.schema import *
+from fatcat_scholar.grobid2json import teixml2json
+
+
+def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]:
+    """
+    Builds the fulltext sub-document for a SIM (archive.org microfilm) bundle.
+
+    The body is the 'raw_text' of every entry in sim['page_texts'], joined
+    with newlines. Thumbnail and access URLs point at the archive.org issue
+    item; the access URL links to the 'page_num' of the first page.
+
+    Returns None when the bundle has no page texts.
+    """
+    pages = sim['page_texts']
+    if not pages:
+        return None
+
+    first_page = pages[0]['page_num']
+    issue_item = sim['issue_item']
+    page_bodies = [page['raw_text'] for page in pages]
+
+    # file-level fields (acknowledgement, annex, file_ident, file_sha1,
+    # file_mimetype) are not passed for SIM content
+    return ScholarFulltext(
+        lang_code=None, # TODO: pub/issue metadata? or langdetect?
+        body="\n".join(page_bodies),
+        release_ident=sim.get('release_ident'),
+        thumbnail_url=f"https://archive.org/serve/{issue_item}/__ia_thumb.jpg",
+        access_url=f"https://archive.org/details/{issue_item}/page/{first_page}",
+        access_type=AccessType.ia_sim,
+    )
+
+def es_sim_from_sim(sim: Dict[str, Any]) -> ScholarSim:
+    """
+    Builds the SIM-specific sub-document (issue item, publication collection,
+    SIM publication id, first page) from a SIM fulltext bundle.
+
+    first_page is the 'page_num' of the first entry in sim['page_texts'], or
+    None when there are no page texts.
+    """
+    pages = sim['page_texts']
+    first_page = pages[0]['page_num'] if pages else None
+
+    pub_meta = sim['pub_item_metadata']['metadata']
+    issue_meta = sim['issue_item_metadata']['metadata']
+    return ScholarSim(
+        issue_item=sim['issue_item'],
+        pub_collection=pub_meta['identifier'],
+        sim_pubid=issue_meta['sim_pubid'],
+        first_page=first_page,
+    )
+
+# Lookup tables from SIM publication metadata values to schema codes. They
+# are consulted with .get(), so any value not listed here maps to None.
+
+# 'pub_type' metadata value -> release_type
+SIM_RELEASE_TYPE_MAP = {
+    "Scholarly Journals": "article-journal",
+    # TODO: remaining publication types
+}
+
+# 'language' metadata value -> lang_code
+SIM_LANG_MAP = {
+    "English": "en",
+    # TODO: remaining languages
+}
+
+# 'country' metadata value -> country_code
+SIM_COUNTRY_MAP = {
+    "Netherlands": "nl",
+    # TODO: remaining countries
+}
+
+def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
+    """
+    Builds bibliographic metadata for a SIM bundle from the archive.org
+    issue-item and publication-item metadata it carries.
+
+    No title, external identifiers, contributors or affiliations are set;
+    only container, volume/issue, date and page information is filled in.
+
+    Missing or malformed values ('date', 'volume', 'issue', 'issn') produce
+    None / empty fields instead of raising.
+    """
+
+    issue_meta = sim['issue_item_metadata']['metadata']
+    pub_meta = sim['pub_item_metadata']['metadata']
+
+    first_page = None
+    if sim['page_texts']:
+        first_page = sim['page_texts'][0]['page_num']
+
+    # Drop a trailing token shaped like "NNNN-NNNN" (presumably an ISSN
+    # appended to the publication title) along with the preceding separator.
+    # Guard against an empty/blank title, which has no last word.
+    container_name = pub_meta['title']
+    title_words = container_name.split()
+    if title_words:
+        last_word = title_words[-1]
+        if len(last_word) == 9 and last_word[4] == '-':
+            container_name = container_name[:-10]
+
+    issns = []
+    raw_issn = issue_meta.get('issn')
+    if raw_issn and len(raw_issn) == 9:
+        issns.append(raw_issn)
+
+    volume = issue_meta.get('volume')
+    volume_int = None
+    if volume and volume.isdigit():
+        volume_int = int(volume)
+    issue = issue_meta.get('issue')
+    issue_int = None
+    if issue and issue.isdigit():
+        issue_int = int(issue)
+
+    # 'date' may be absent; a bare "YYYY" value still yields a release year
+    date = issue_meta.get('date')
+    release_year = None
+    if date and len(date) >= 4 and date[:4].isdigit():
+        release_year = int(date[:4])
+
+    # only full "YYYY-MM-DD"-length values are passed through as a date
+    release_date = None
+    if date and len(date) == len("2000-01-01"):
+        release_date = date
+
+    return ScholarBiblio(
+        title=None,
+        release_date=release_date,
+        release_year=release_year,
+        release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get('pub_type')),
+        release_stage="published", # as a default
+        lang_code=SIM_LANG_MAP.get(pub_meta.get('language')),
+        country_code=SIM_COUNTRY_MAP.get(pub_meta.get('country')),
+        volume=volume,
+        volume_int=volume_int,
+        issue=issue,
+        issue_int=issue_int,
+        pages=sim.get('pages'),
+        first_page=first_page,
+        first_page_int=None,
+
+        # no external identifiers
+
+        publisher=issue_meta.get('publisher'),
+        container_name=container_name,
+        container_original_name=None, # TODO pass-through
+        container_ident=None, # TODO: pass-through
+        container_type=None, # TODO
+        container_issnl=None, # TODO: pass-through
+        issns=issns,
+
+        # no contrib/affiliation info
+        contrib_names=[],
+        affiliations=[],
+    )
+
+def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity) -> ScholarFulltext:
+    """
+    Copies release/file identifiers onto `fulltext` and picks an access URL
+    from the file's URL list. Mutates `fulltext` in place and returns it.
+
+    URL preference:
+      1. the first archive.org or web.archive.org URL encountered (the scan
+         stops there)
+      2. otherwise a URL with rel "repository"
+      3. otherwise any other URL, typed as generic web
+
+    When the file has no URLs, access_url and access_type are set to None.
+    """
+    best_url = None
+    best_url_type = None
+    # fe.urls may be unset on the entity; treat that like an empty list
+    for url in fe.urls or []:
+        if '//archive.org/' in url.url:
+            best_url = url.url
+            best_url_type = AccessType.ia_file
+            break
+        elif '//web.archive.org/' in url.url:
+            best_url = url.url
+            best_url_type = AccessType.wayback
+            break
+        if url.rel == "repository":
+            best_url = url.url
+            best_url_type = AccessType.repository
+        elif best_url_type != AccessType.repository:
+            # a plain URL must not displace a repository URL already chosen
+            best_url = url.url
+            best_url_type = AccessType.web
+        # TODO: more file-to-access logic
+
+    fulltext.release_ident = re.ident
+    fulltext.file_ident = fe.ident
+    fulltext.file_sha1 = fe.sha1
+    fulltext.file_mimetype = fe.mimetype
+    fulltext.access_url = best_url
+    fulltext.access_type = best_url_type
+    return fulltext
+
+
+def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
+    """
+    Builds a fulltext sub-document from GROBID TEI-XML output.
+
+    The XML is converted with teixml2json(); its 'lang', 'body',
+    'acknowledgement' and 'annex' values populate the document, and
+    release/file identifiers plus access URL are then filled in from `re`
+    and `fe`.
+    """
+    parsed = teixml2json(tei_xml)
+    fulltext = ScholarFulltext(
+        lang_code=parsed.get('lang'),
+        body=parsed.get('body'),
+        acknowledgement=parsed.get('acknowledgement'),
+        annex=parsed.get('annex'),
+        thumbnail_url=None, # TODO: sandcrawler thumbnails
+    )
+    return _add_file_release_meta(fulltext, re, fe)
+
+def es_fulltext_from_pdftotext(pdftotext: Any, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
+    """
+    Builds a fulltext sub-document from pdftotext output.
+
+    The body is pdftotext['raw_text'] and the language is taken from the
+    release entity; acknowledgement and annex are not available. Release/file
+    identifiers plus access URL are then filled in from `re` and `fe`.
+    """
+    raw_body = pdftotext['raw_text']
+    fulltext = ScholarFulltext(
+        lang_code=re.language,
+        body=raw_body,
+        acknowledgement=None,
+        annex=None,
+        thumbnail_url=None, # TODO: sandcrawler thumbnails
+    )
+    return _add_file_release_meta(fulltext, re, fe)
+
+def _find_release_and_file(releases: Sequence[ReleaseEntity], release_ident: str, file_ident: str) -> Tuple[ReleaseEntity, FileEntity]:
+    """
+    Returns the release with `release_ident` and, among its files, the file
+    with `file_ident`. Raises IndexError if either is not present.
+    """
+    release = [r for r in releases if r.ident == release_ident][0]
+    file_entity = [f for f in release.files if f.ident == file_ident][0]
+    return (release, file_entity)
+
+def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
+    """
+    Transforms a "heavy" intermediate bundle into a search-index document.
+
+    For sim_page bundles, biblio and fulltext come from the SIM metadata and
+    page texts. For work bundles, biblio and abstracts come from the release
+    named by biblio_release_ident. In both cases GROBID fulltext, when
+    present, is used as the fulltext; pdftotext output is the fallback when
+    no fulltext was produced otherwise.
+
+    Raises NotImplementedError for any other doc_type, and IndexError when a
+    release/file ident referenced by the bundle is not among its releases.
+    """
+
+    tags: List[str] = []
+    work_ident: Optional[str] = None
+    abstracts: List[ScholarAbstract] = []
+    # must start as None: a work bundle without GROBID output reaches the
+    # pdftotext check below without any earlier assignment
+    fulltext: Optional[ScholarFulltext] = None
+
+    ia_sim: Optional[ScholarSim] = None
+    if heavy.sim_fulltext is not None:
+        ia_sim = es_sim_from_sim(heavy.sim_fulltext)
+
+    if heavy.doc_type == DocType.sim_page:
+        assert ia_sim is not None
+        key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}"
+        biblio = es_biblio_from_sim(heavy.sim_fulltext)
+        fulltext = es_fulltext_from_sim(heavy.sim_fulltext)
+    elif heavy.doc_type == DocType.work:
+        work_ident = heavy.releases[0].work_id
+        key = f"work_{work_ident}"
+        assert heavy.biblio_release_ident
+        primary_release = [r for r in heavy.releases if r.ident == heavy.biblio_release_ident][0]
+        biblio = es_biblio_from_release(primary_release)
+
+        # TODO: abstracts from releases also? abstracts_dict?
+        abstracts = es_abstracts_from_release(primary_release)
+    else:
+        raise NotImplementedError(f"doc_type: {heavy.doc_type}")
+
+    if heavy.grobid_fulltext:
+        fulltext_release, fulltext_file = _find_release_and_file(
+            heavy.releases,
+            heavy.grobid_fulltext['release_ident'],
+            heavy.grobid_fulltext['file_ident'],
+        )
+        fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)
+
+        # hack to pull through thumbnail from local pdftotext
+        if fulltext and not fulltext.thumbnail_url and heavy.pdftotext_fulltext:
+            fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/sha1/{fulltext_file.sha1}" # XXX
+
+    if not fulltext and heavy.pdftotext_fulltext:
+        # look the idents up in the pdftotext bundle itself; grobid_fulltext
+        # is typically absent on this path.
+        # NOTE(review): assumes the pdftotext bundle carries 'release_ident'
+        # and 'file_ident' like the GROBID one -- confirm against the pipeline
+        fulltext_release, fulltext_file = _find_release_and_file(
+            heavy.releases,
+            heavy.pdftotext_fulltext['release_ident'],
+            heavy.pdftotext_fulltext['file_ident'],
+        )
+        fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file)
+
+    # TODO: additional access list
+    # keyed by access type so each type appears at most once
+    access_dict: Dict[AccessType, ScholarAccess] = dict()
+    if fulltext and fulltext.access_type:
+        access_dict[fulltext.access_type] = ScholarAccess(
+            access_type=fulltext.access_type,
+            access_url=fulltext.access_url,
+            mimetype=fulltext.file_mimetype,
+            file_ident=fulltext.file_ident,
+            release_ident=fulltext.release_ident,
+        )
+    if ia_sim and AccessType.ia_sim not in access_dict:
+        access_dict[AccessType.ia_sim] = ScholarAccess(
+            access_type=AccessType.ia_sim,
+            access_url=f"https://archive.org/details/{ia_sim.issue_item}/page/{ia_sim.first_page}",
+        )
+
+    # TODO: additional abstracts
+
+    return ScholarDoc(
+        key=key,
+        doc_type=heavy.doc_type.value,
+        doc_index_ts=datetime.datetime.utcnow(),
+        work_ident=work_ident,
+        tags=tags,
+
+        biblio=biblio,
+        fulltext=fulltext,
+        ia_sim=ia_sim,
+        abstracts=abstracts,
+        releases=[es_release_from_release(r) for r in heavy.releases],
+        access=list(access_dict.values()),
+    )
+
+def run_transform(infile):
+    """
+    Reads "heavy" intermediate bundles as JSON-lines from `infile`, transforms
+    each one with transform_heavy(), and prints the resulting document as a
+    single JSON line on stdout.
+
+    Blank lines in the input are skipped, as are bundles for which the
+    transform returns nothing.
+    """
+    for line in infile:
+        # tolerate blank lines (eg, trailing newline at end of input)
+        if not line.strip():
+            continue
+        obj = json.loads(line)
+
+        # releases are re-serialized so entity_from_json() can rebuild the
+        # API entity objects from their JSON form
+        releases = [
+            entity_from_json(json.dumps(release_dict), ReleaseEntity)
+            for release_dict in obj['releases']
+        ]
+        heavy = IntermediateBundle(
+            doc_type=DocType(obj['doc_type']),
+            releases=releases,
+            biblio_release_ident=obj.get('biblio_release_ident'),
+            grobid_fulltext=obj.get('grobid_fulltext'),
+            pdftotext_fulltext=obj.get('pdftotext_fulltext'),
+            sim_fulltext=obj.get('sim_fulltext'),
+        )
+        es_doc = transform_heavy(heavy)
+        if not es_doc:
+            continue
+        print(es_doc.json())
+
+def main():
+    """
+    Command-line entry point. Invoke as a module with a sub-command, eg:
+
+        python -m fatcat_scholar.transform run_transform [json_file]
+
+    Input is read from stdin when no file is given.
+    """
+
+    arg_parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    commands = arg_parser.add_subparsers()
+
+    transform_cmd = commands.add_parser(
+        'run_transform',
+        help="iterates through 'heavy' intermediate",
+    )
+    transform_cmd.set_defaults(func='run_transform')
+    transform_cmd.add_argument(
+        "json_file",
+        help="intermediate globs as JSON-lines",
+        nargs='?',
+        default=sys.stdin,
+        type=argparse.FileType('r'),
+    )
+
+    args = arg_parser.parse_args()
+    # 'func' is only set when a sub-command was given
+    command = vars(args).get("func")
+    if not command:
+        print("tell me what to do! (try --help)")
+        sys.exit(-1)
+
+    if command != 'run_transform':
+        raise NotImplementedError(command)
+    run_transform(infile=args.json_file)
+
+# only run the CLI when executed as a script/module, not on import
+if __name__ == "__main__":
+    main()
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index b9dcbe8..081878c 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -12,7 +12,7 @@ from fatcat_scholar.api_entities import *
 from fatcat_scholar.djvu import djvu_extract_leaf_texts
 from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
 from fatcat_scholar.issue_db import IssueDB, SimIssueRow
-from fatcat_scholar.es_transform import es_biblio_from_release, es_release_from_release, DocType
+from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
 
 
 def parse_pages(raw: str) -> Tuple[Optional[int], Optional[int]]:
@@ -44,21 +44,6 @@ def test_parse_pages():
     assert parse_pages("iiv") == (None, None)
 
 
-class IntermediateBundle(BaseModel):
-    doc_type: DocType
-    releases: List[ReleaseEntity]
-    biblio_release_ident: Optional[str]
-    grobid_fulltext: Optional[Any]
-    pdftotext_fulltext: Optional[Any]
-    sim_fulltext: Optional[Any]
-
-    class Config:
-        arbitrary_types_allowed = True
-        json_encoders = {
-            ReleaseEntity: lambda re: entity_to_dict(re),
-        }
-
-
 def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]:
     """
     Returns a list of release idents in preference order (best first) to
-- 
cgit v1.2.3