aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-20 13:27:55 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-20 13:27:55 -0700
commitf2c465fffc76ca752249e11d32673db43efc35f1 (patch)
treee000389c916c56c322a984ebdf440a2c6129a0e0
parent3ee18580dd108c69c01cdf838a7f1a7d3d181629 (diff)
downloadfatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.tar.gz
fatcat-scholar-f2c465fffc76ca752249e11d32673db43efc35f1.zip
first pass transform from pipelines to ES schema
-rwxr-xr-xfatcat_scholar/grobid2json.py201
-rw-r--r--fatcat_scholar/schema.py (renamed from fatcat_scholar/es_transform.py)30
-rw-r--r--fatcat_scholar/sim_pipeline.py12
-rw-r--r--fatcat_scholar/transform.py306
-rw-r--r--fatcat_scholar/work_pipeline.py17
-rw-r--r--tests/test_transform.py2
6 files changed, 541 insertions, 27 deletions
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
new file mode 100755
index 0000000..9c2ffad
--- /dev/null
+++ b/fatcat_scholar/grobid2json.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+
+"""
+NB: adapted to work as a library for PDF extraction. Will probably be
+re-written eventually to be correct, complete, and robust; this is just a
+first iteration.
+
+This script tries to extract everything from a GROBID TEI XML fulltext dump:
+
+- header metadata
+- affiliations
+- references (with context)
+- abstract
+- fulltext
+- tables, figures, equations
+
+A flag can be specified to disable copyright encumbered bits (--no-encumbered):
+
+- abstract
+- fulltext
+- tables, figures, equations
+
+Prints JSON to stdout, errors to stderr
+
+This file copied from the sandcrawler repository.
+"""
+
+import io
+import json
+import argparse
+import xml.etree.ElementTree as ET
+
+xml_ns = "http://www.w3.org/XML/1998/namespace"
+ns = "http://www.tei-c.org/ns/1.0"
+
def all_authors(elem):
    """
    Collect author records from every TEI ``<author>`` element under ``elem``.

    Authors without a ``<persName>`` child are skipped. Each returned dict
    always has ``name`` (all text under ``<persName>`` joined with spaces) and
    may additionally have ``given_name``, ``surname`` and ``affiliation``.

    The affiliation dict maps each ``<orgName>``'s ``type`` attribute to its
    text, and holds an ``address`` sub-dict (child tag name without namespace
    -> text) when the ``<address>`` element has children.

    Returns a list of dicts, in document order.
    """
    names = []
    for author in elem.findall('.//{%s}author' % ns):
        pn = author.find('./{%s}persName' % ns)
        # Compare against None explicitly: the truth value of an Element
        # reflects its number of children, not whether it was found.
        if pn is None:
            continue
        obj = dict(name=' '.join(pn.itertext()))
        given_name = pn.findtext('./{%s}forename' % ns)
        if given_name:
            obj['given_name'] = given_name
        surname = pn.findtext('./{%s}surname' % ns)
        if surname:
            obj['surname'] = surname

        ae = author.find('./{%s}affiliation' % ns)
        if ae is not None:
            affiliation = {
                on.get('type'): on.text
                for on in ae.findall('./{%s}orgName' % ns)
            }
            addr_e = ae.find('./{%s}address' % ns)
            if addr_e is not None:
                # Iterate the element directly; Element.getchildren() no
                # longer exists on Python 3.9+.
                address = {
                    child.tag.split('}')[-1]: child.text
                    for child in addr_e
                }
                if address:
                    affiliation['address'] = address
            obj['affiliation'] = affiliation
        names.append(obj)
    return names
+
+
def journal_info(elem):
    """
    Extract journal-level metadata found under ``elem``.

    Looks up the monograph title, publisher, ISSN, eISSN, volume and issue
    and returns a dict containing only the fields that were found with
    non-empty text. Possible keys, in order: ``name``, ``publisher``,
    ``issn``, ``eissn``, ``volume``, ``issue``.
    """
    paths = (
        ('name', './/{%s}monogr/{%s}title' % (ns, ns)),
        ('publisher', './/{%s}publicationStmt/{%s}publisher' % (ns, ns)),
        ('issn', './/{%s}idno[@type="ISSN"]' % ns),
        ('eissn', './/{%s}idno[@type="eISSN"]' % ns),
        ('volume', './/{%s}biblScope[@unit="volume"]' % ns),
        ('issue', './/{%s}biblScope[@unit="issue"]' % ns),
    )
    journal = dict()
    for key, path in paths:
        value = elem.findtext(path)
        # findtext() gives None for a missing element and '' for an empty
        # one; both are left out of the result
        if value:
            journal[key] = value
    return journal
+
+
def biblio_info(elem):
    """
    Convert one bibliography entry element into a flat dict.

    Keys always present: ``id`` (the element's ``xml:id``), ``title``,
    ``authors`` (see ``all_authors``), ``publisher``, ``date`` (the ``when``
    attribute of the published date), ``volume``, ``issue`` and ``url``.
    Values are ``None`` when the corresponding element is missing.
    ``journal`` is only present when the entry has a monograph title.
    """
    ref = dict()
    ref['id'] = elem.attrib.get('{http://www.w3.org/XML/1998/namespace}id')
    # Title stuff is messy in references: when only a monograph title exists
    # it is promoted to the entry title and no journal name is recorded.
    ref['title'] = elem.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
    other_title = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
    if other_title:
        if ref['title']:
            ref['journal'] = other_title
        else:
            ref['journal'] = None
            ref['title'] = other_title
    ref['authors'] = all_authors(elem)
    # normalise an empty <publisher/> to None
    ref['publisher'] = elem.findtext(
        './/{%s}publicationStmt/{%s}publisher' % (ns, ns)) or None
    date = elem.find('.//{%s}date[@type="published"]' % ns)
    # None (not False) when there is no published date
    ref['date'] = date.attrib.get('when') if date is not None else None
    ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
    ref['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
    el = elem.find('.//{%s}ptr[@target]' % ns)
    if el is not None:
        url = el.attrib['target']
        # Hand correction: drop a trailing ".Lastaccessed" fused onto the URL
        # (only the suffix, not other occurrences inside the URL)
        junk = ".Lastaccessed"
        if url.endswith(junk):
            url = url[:-len(junk)]
        ref['url'] = url
    else:
        ref['url'] = None
    return ref
+
+
def _element_text(el):
    """Return all text under ``el`` joined with spaces, or None if ``el`` is None."""
    if el is None:
        return None
    return " ".join(el.itertext()).strip()


def teixml2json(content, encumbered=True):
    """
    Parse a GROBID TEI XML document into a plain dict.

    ``content`` may be a ``str``, ``bytes``, or anything ``ET.parse`` accepts
    (for example an open file object). When ``encumbered`` is false the
    abstract, body, acknowledgement and annex texts are not extracted.

    Only keys with truthy values are returned. Possible keys:
    ``grobid_version``, ``grobid_timestamp``, ``title``, ``authors``,
    ``journal``, ``date``, ``fatcat_release``, ``doi`` (lower-cased),
    ``citations``, ``language_code``, ``abstract``, ``body``,
    ``acknowledgement``, ``annex``.

    Raises ValueError if the document has no ``teiHeader`` element.
    """
    if isinstance(content, str):
        content = io.StringIO(content)
    elif isinstance(content, bytes):
        content = io.BytesIO(content)

    info = dict()

    tree = ET.parse(content)
    tei = tree.getroot()

    header = tei.find('.//{%s}teiHeader' % ns)
    if header is None:
        raise ValueError("XML does not look like TEI format")

    # The application tag records which GROBID run produced the file; leave
    # the fields out instead of crashing when it is absent.
    application_tag = header.find('.//{%s}appInfo/{%s}application' % (ns, ns))
    if application_tag is not None:
        info['grobid_version'] = (application_tag.attrib.get('version') or '').strip()
        info['grobid_timestamp'] = (application_tag.attrib.get('when') or '').strip()

    info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
    bibl = header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns))
    info['authors'] = all_authors(bibl) if bibl is not None else []
    info['journal'] = journal_info(header)
    date = header.find('.//{%s}date[@type="published"]' % ns)
    info['date'] = date.attrib.get('when') if date is not None else None
    info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
    info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
    if info['doi']:
        info['doi'] = info['doi'].lower()

    refs = []
    for (i, bs) in enumerate(tei.findall('.//{%s}listBibl/{%s}biblStruct' % (ns, ns))):
        ref = biblio_info(bs)
        ref['index'] = i
        refs.append(ref)
    info['citations'] = refs

    text = tei.find('.//{%s}text' % ns)
    if text is not None:
        lang = text.attrib.get('{%s}lang' % xml_ns)  # xml:lang
        if lang:
            info['language_code'] = lang

    if encumbered:
        # Presence is tested with "is None" inside _element_text, so an
        # element holding only text (no child elements) is still extracted.
        info['abstract'] = _element_text(
            tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns)))
        info['body'] = _element_text(
            tei.find('.//{%s}text/{%s}body' % (ns, ns)))
        info['acknowledgement'] = _element_text(
            tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns)))
        info['annex'] = _element_text(
            tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns)))

    # remove empty/null keys
    return {k: v for (k, v) in info.items() if v}
+
def main():   # pragma no cover
    """
    Command-line entry point: convert each TEI file named on the command
    line to one line of key-sorted JSON on stdout.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="GROBID TEI XML to JSON",
        usage="%(prog)s [options] <teifile>...")
    parser.add_argument("--no-encumbered",
        action="store_true",
        help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
    parser.add_argument("teifiles", nargs='+')

    args = parser.parse_args()

    for filename in args.teifiles:
        # the context manager closes each input once it has been converted
        with open(filename, 'r') as content:
            print(json.dumps(
                teixml2json(content,
                            encumbered=(not args.no_encumbered)),
                sort_keys=True))

if __name__=='__main__':   # pragma no cover
    main()
diff --git a/fatcat_scholar/es_transform.py b/fatcat_scholar/schema.py
index 464b082..aa4ed52 100644
--- a/fatcat_scholar/es_transform.py
+++ b/fatcat_scholar/schema.py
@@ -6,23 +6,40 @@ auto-conversion of datetime objects.
"""
import ftfy
-import typing
import datetime
from enum import Enum
-from typing import Optional, List
+from typing import Optional, List, Any
from xml.etree import cElementTree as ET
from pydantic import BaseModel
+
from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
+from fatcat_scholar.api_entities import entity_to_dict
class DocType(str, Enum):
work = "work"
sim_page = "sim_page"
class IntermediateBundle(BaseModel):
    """
    "Heavy" intermediate record passed from the pipelines to the transform
    step: the releases of a document plus whichever fulltext sources
    (GROBID, pdftotext, SIM) were fetched for it.
    """
    doc_type: DocType
    releases: List[ReleaseEntity]
    # ident of the release that supplies bibliographic metadata
    biblio_release_ident: Optional[str]
    # the fulltext payloads are untyped here
    grobid_fulltext: Optional[Any]
    pdftotext_fulltext: Optional[Any]
    sim_fulltext: Optional[Any]

    class Config:
        # ReleaseEntity is not a pydantic model, so it must be allowed
        # explicitly and given its own JSON encoder
        arbitrary_types_allowed = True
        json_encoders = {
            ReleaseEntity: lambda entity: entity_to_dict(entity),
        }
+
+
class AccessType(str, Enum):
ia_sim = "ia_sim"
ia_file = "ia_file"
wayback = "wayback"
+ web = "web"
repository = "repository"
paywall = "paywall"
loginwall = "loginwall"
@@ -30,7 +47,7 @@ class AccessType(str, Enum):
class ScholarBiblio(BaseModel):
release_ident: Optional[str]
- title: str
+ title: Optional[str]
subtitle: Optional[str]
original_title: Optional[str]
release_date: Optional[datetime.date]
@@ -127,7 +144,7 @@ class ScholarAbstract(BaseModel):
class ScholarAccess(BaseModel):
access_type: AccessType
access_url: str
- mimetype: str
+ mimetype: Optional[str]
file_ident: Optional[str]
release_ident: Optional[str]
@@ -139,8 +156,8 @@ class ScholarDoc(BaseModel):
tags: List[str] = []
biblio: ScholarBiblio
- fulltext: ScholarFulltext
- ia_sim: ScholarSim
+ fulltext: Optional[ScholarFulltext]
+ ia_sim: Optional[ScholarSim]
abstracts: List[ScholarAbstract]
releases: List[ScholarRelease]
access: List[ScholarAccess]
@@ -187,6 +204,7 @@ def contrib_name(contrib: ReleaseContrib) -> str:
return contrib.given_name
def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
+ # TODO
return None
def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index 1dd6476..4315e70 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -13,8 +13,7 @@ from fatcat_scholar.api_entities import *
from fatcat_scholar.djvu import djvu_extract_leaf_texts
from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
from fatcat_scholar.issue_db import IssueDB, SimIssueRow
-from fatcat_scholar.es_transform import es_biblio_from_release, es_release_from_release, DocType
-from fatcat_scholar.work_pipeline import IntermediateBundle
+from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]:
@@ -52,8 +51,9 @@ class SimPipeline():
issue_item
pages: str
page_texts: list
- page_number
raw_text
+ page_num
+ leaf_num
release_ident: Optional[str]
pub_item_metadata
issue_item_metadata
@@ -107,6 +107,10 @@ class SimPipeline():
self.issue_db.db.row_factory = sqlite3.Row
cur = self.issue_db.db.cursor()
for row in cur.execute('SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3'):
+ # filter out "contents" and "index" items
+ # TODO: more filters; also redundant with IssueDB code?
+ if row['issue_item'].endswith('_contents') or row['issue_item'].endswith('_index'):
+ continue
full_issue = self.fetch_sim_issue(row)
if not full_issue:
continue
@@ -120,7 +124,7 @@ class SimPipeline():
sim_fulltext=dict(
issue_item=full_issue['issue_item'],
pages=str(leaf['page_num']),
- page_texts=[leaf['raw_text']],
+ page_texts=[leaf],
release_ident=None,
pub_item_metadata=full_issue['pub_item_metadata'],
issue_item_metadata=full_issue['issue_item_metadata'],
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
new file mode 100644
index 0000000..54d3f71
--- /dev/null
+++ b/fatcat_scholar/transform.py
@@ -0,0 +1,306 @@
+
+import os
+import io
+import sys
+import argparse
+from pydantic import BaseModel, validator
+from typing import List, Dict, Tuple, Optional, Any, Sequence
+from fatcat_openapi_client import ReleaseEntity, FileEntity
+import internetarchive
+
+from fatcat_scholar.api_entities import *
+from fatcat_scholar.djvu import djvu_extract_leaf_texts
+from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
+from fatcat_scholar.issue_db import IssueDB, SimIssueRow
+from fatcat_scholar.schema import *
+from fatcat_scholar.grobid2json import teixml2json
+
+
def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]:
    """
    Build a fulltext record from a SIM issue bundle.

    The body is the ``raw_text`` of every entry in ``sim['page_texts']``
    joined with newlines; the access URL points at the first page's
    ``page_num`` within the issue item on archive.org. Returns None when
    there are no page texts.
    """
    pages = sim['page_texts']
    if not pages:
        return None
    leading_page = pages[0]['page_num']
    item = sim['issue_item']
    joined_text = "\n".join(page['raw_text'] for page in pages)
    # acknowledgement, annex and the file_* fields are left at their defaults
    return ScholarFulltext(
        lang_code=None,  # TODO: pub/issue metadata? or langdetect?
        body=joined_text,
        release_ident=sim.get('release_ident'),
        thumbnail_url=f"https://archive.org/serve/{item}/__ia_thumb.jpg",
        access_url=f"https://archive.org/details/{item}/page/{leading_page}",
        access_type=AccessType.ia_sim,
    )
+
def es_sim_from_sim(sim: Dict[str, Any]) -> ScholarSim:
    """
    Build the SIM descriptor for a bundle: issue item, publication
    collection identifier, ``sim_pubid``, and the ``page_num`` of the first
    page text (None when there are no page texts).
    """
    pages = sim['page_texts']
    leading_page = pages[0]['page_num'] if pages else None
    return ScholarSim(
        issue_item=sim['issue_item'],
        pub_collection=sim['pub_item_metadata']['metadata']['identifier'],
        sim_pubid=sim['issue_item_metadata']['metadata']['sim_pubid'],
        first_page=leading_page,
    )
+
# Lookup tables used by es_biblio_from_sim() to translate free-text values
# from the publication item metadata into index codes. All three are still
# stubs; unknown values simply map to None via dict.get().

# 'pub_type' -> release_type
SIM_RELEASE_TYPE_MAP = {
    "Scholarly Journals": "article-journal",
    # TODO:
}

# 'language' -> lang_code
SIM_LANG_MAP = {
    "English": "en",
    # TODO:
}

# 'country' -> country_code
SIM_COUNTRY_MAP = {
    "Netherlands": "nl",
    # TODO:
}
+
def _digits_to_int(raw: Any) -> Optional[int]:
    """Return ``int(raw)`` when ``raw`` is an all-digit string, else None."""
    if isinstance(raw, str) and raw.isdigit():
        return int(raw)
    return None


def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
    """
    Build bibliographic metadata for a SIM page from the issue and
    publication item metadata carried in ``sim``.

    There is no article-level information at this point, so the title is
    None and the contributor/affiliation lists are empty. Tolerates a
    missing issue date and an empty publication title.
    """
    issue_meta = sim['issue_item_metadata']['metadata']
    pub_meta = sim['pub_item_metadata']['metadata']

    first_page = None
    if sim['page_texts']:
        first_page = sim['page_texts'][0]['page_num']

    # Strip a trailing ISSN-shaped word ("NNNN-NNNN") plus the single
    # separator character before it from the publication title.
    container_name = pub_meta['title']
    words = container_name.split()
    if words and len(words[-1]) == 9 and words[-1][4] == '-':
        container_name = container_name[:-10]

    issns = []
    raw_issn = issue_meta.get('issn')
    if raw_issn and len(raw_issn) == 9:
        issns.append(raw_issn)

    volume = issue_meta.get('volume')
    issue = issue_meta.get('issue')

    date = issue_meta.get('date')
    release_year = None
    release_date = None
    if date:
        # ">= 4" so that a year-only date such as "1999" still yields a year
        if len(date) >= 4 and date[:4].isdigit():
            release_year = int(date[:4])
        # only full-length dates are passed through as release_date
        if len(date) == len("2000-01-01"):
            release_date = date

    return ScholarBiblio(
        title=None,
        release_date=release_date,
        release_year=release_year,
        release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get('pub_type')),
        release_stage="published",  # as a default
        lang_code=SIM_LANG_MAP.get(pub_meta.get('language')),
        country_code=SIM_COUNTRY_MAP.get(pub_meta.get('country')),
        volume=volume,
        volume_int=_digits_to_int(volume),
        issue=issue,
        issue_int=_digits_to_int(issue),
        pages=sim.get('pages'),
        first_page=first_page,
        first_page_int=None,

        # no external identifiers

        publisher=issue_meta.get('publisher'),
        container_name=container_name,
        container_original_name=None,  # TODO pass-through
        container_ident=None,  # TODO: pass-through
        container_type=None,  # TODO
        container_issnl=None,  # TODO: pass-through
        issns=issns,

        # no contrib/affiliation info
        contrib_names=[],
        affiliations=[],
    )
+
def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity) -> ScholarFulltext:
    """
    Fill in the release/file identifiers and the access URL on ``fulltext``
    (mutated in place and returned).

    URL selection as implemented: the first archive.org or web.archive.org
    URL wins immediately; otherwise the last URL in ``fe.urls`` is kept,
    typed ``repository`` when its ``rel`` says so and ``web`` otherwise.
    With no URLs at all, both the access URL and access type are None.
    """
    best_url = None
    best_url_type = None
    # fe.urls may be unset on a file entity; treat that like an empty list
    for url in fe.urls or []:
        best_url = url.url
        best_url_type = AccessType.web
        if '//archive.org/' in url.url:
            best_url_type = AccessType.ia_file
            break
        elif '//web.archive.org/' in url.url:
            best_url_type = AccessType.wayback
            break
        if url.rel == "repository":
            best_url_type = AccessType.repository
    # TODO: more file-to-access logic

    fulltext.release_ident = re.ident
    fulltext.file_ident = fe.ident
    fulltext.file_sha1 = fe.sha1
    fulltext.file_mimetype = fe.mimetype
    fulltext.access_url = best_url
    fulltext.access_type = best_url_type
    return fulltext
+
+
def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
    """
    Build a fulltext record from GROBID TEI XML, then attach release/file
    identifiers and the access URL via ``_add_file_release_meta``.
    """
    obj = teixml2json(tei_xml)
    ret = ScholarFulltext(
        # teixml2json() stores the document language under 'language_code'
        lang_code=obj.get('language_code'),
        body=obj.get('body'),
        acknowledgement=obj.get('acknowledgement'),
        annex=obj.get('annex'),
        thumbnail_url=None,  # TODO: sandcrawler thumbnails
    )
    return _add_file_release_meta(ret, re, fe)
+
def es_fulltext_from_pdftotext(pdftotext: Any, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
    """
    Build a fulltext record from pdftotext output: the body is
    ``pdftotext['raw_text']`` and the language comes from the release.
    Release/file identifiers and the access URL are attached via
    ``_add_file_release_meta``.
    """
    fields = dict(
        lang_code=re.language,
        body=pdftotext['raw_text'],
        acknowledgement=None,
        annex=None,
        thumbnail_url=None,  # TODO: sandcrawler thumbnails
    )
    extracted = ScholarFulltext(**fields)
    return _add_file_release_meta(extracted, re, fe)
+
def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
    """
    Turn one "heavy" intermediate bundle into a search-index document.

    ``sim_page`` bundles take their key, biblio and fulltext from the SIM
    data. ``work`` bundles take biblio and abstracts from the release named
    by ``biblio_release_ident``. GROBID fulltext, when present, replaces any
    earlier fulltext; pdftotext output is used only when no fulltext has
    been produced by then.

    Raises NotImplementedError for any other doc_type, and IndexError when a
    referenced release or file ident is not among ``heavy.releases``.
    """
    # imported locally so the timestamp below does not depend on ``datetime``
    # arriving through one of the module's wildcard imports
    import datetime

    tags: List[str] = []
    work_ident: Optional[str] = None
    abstracts: List[ScholarAbstract] = []
    # Must start out bound: a work bundle with no GROBID output reaches the
    # "not fulltext" check below without any earlier assignment.
    fulltext: Optional[ScholarFulltext] = None

    ia_sim: Optional[ScholarSim] = None
    if heavy.sim_fulltext is not None:
        ia_sim = es_sim_from_sim(heavy.sim_fulltext)

    if heavy.doc_type == DocType.sim_page:
        assert ia_sim is not None
        key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}"
        biblio = es_biblio_from_sim(heavy.sim_fulltext)
        fulltext = es_fulltext_from_sim(heavy.sim_fulltext)
    elif heavy.doc_type == DocType.work:
        work_ident = heavy.releases[0].work_id
        key = f"work_{work_ident}"
        assert heavy.biblio_release_ident
        primary_release = [r for r in heavy.releases if r.ident == heavy.biblio_release_ident][0]
        biblio = es_biblio_from_release(primary_release)

        # TODO: abstracts from releases also? abstracts_dict?
        abstracts = es_abstracts_from_release(primary_release)
    else:
        raise NotImplementedError(f"doc_type: {heavy.doc_type}")

    if heavy.grobid_fulltext:

        fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]

        fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)

        # hack to pull through thumbnail from local pdftotext
        if fulltext and not fulltext.thumbnail_url and heavy.pdftotext_fulltext:
            fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/sha1/{fulltext_file.sha1}"  # XXX

    if not fulltext and heavy.pdftotext_fulltext:

        # Look the release/file up from the pdftotext record itself; the
        # GROBID record is normally absent on this path.
        # NOTE(review): assumes pdftotext_fulltext carries 'release_ident'
        # and 'file_ident' like grobid_fulltext does -- confirm against the
        # work pipeline.
        fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0]
        fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0]
        fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file)

    # TODO: additional access list
    # keyed by access type, so at most one entry per type
    access_dict = dict()
    if fulltext and fulltext.access_type:
        access_dict[fulltext.access_type] = ScholarAccess(
            access_type=fulltext.access_type,
            access_url=fulltext.access_url,
            mimetype=fulltext.file_mimetype,
            file_ident=fulltext.file_ident,
            release_ident=fulltext.release_ident,
        )
    if ia_sim and AccessType.ia_sim not in access_dict:
        access_dict[AccessType.ia_sim] = ScholarAccess(
            access_type=AccessType.ia_sim,
            access_url=f"https://archive.org/details/{ia_sim.issue_item}/page/{ia_sim.first_page}",
        )

    # TODO: additional abstracts

    return ScholarDoc(
        key=key,
        doc_type=heavy.doc_type.value,
        doc_index_ts=datetime.datetime.utcnow(),
        work_ident=work_ident,
        tags=tags,

        biblio=biblio,
        fulltext=fulltext,
        ia_sim=ia_sim,
        abstracts=abstracts,
        releases=[es_release_from_release(r) for r in heavy.releases],
        access=list(access_dict.values()),
    )
+
def run_transform(infile):
    """
    Read JSON-lines intermediate bundles from ``infile`` (any iterable of
    text lines), transform each one with ``transform_heavy`` and print the
    resulting document as JSON, one per line. Blank lines are skipped, as
    are bundles for which the transform returns nothing.
    """
    # imported locally: this module does not import ``json`` itself
    import json

    for line in infile:
        if not line.strip():
            continue
        obj = json.loads(line)

        heavy = IntermediateBundle(
            doc_type=DocType(obj['doc_type']),
            # each release dict is re-serialised so entity_from_json() can
            # rebuild a ReleaseEntity from it
            releases=[entity_from_json(json.dumps(release), ReleaseEntity) for release in obj['releases']],
            biblio_release_ident=obj.get('biblio_release_ident'),
            grobid_fulltext=obj.get('grobid_fulltext'),
            pdftotext_fulltext=obj.get('pdftotext_fulltext'),
            sim_fulltext=obj.get('sim_fulltext'),
        )
        es_doc = transform_heavy(heavy)
        if not es_doc:
            continue
        print(es_doc.json())
+
def main():
    """
    Command-line entry point. Run this command like:

        python -m fatcat_scholar.transform

    The only sub-command is ``run_transform``, which reads JSON lines from
    the given file, or from stdin when no file is named.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    transform_cmd = subparsers.add_parser(
        'run_transform',
        help="iterates through 'heavy' intermediate")
    transform_cmd.set_defaults(func='run_transform')
    transform_cmd.add_argument(
        "json_file",
        help="intermediate globs as JSON-lines",
        nargs='?', default=sys.stdin, type=argparse.FileType('r'))

    args = parser.parse_args()
    # "func" is only set once a sub-command has been chosen
    chosen = vars(args).get("func")
    if not chosen:
        print("tell me what to do! (try --help)")
        sys.exit(-1)

    if chosen != 'run_transform':
        raise NotImplementedError(chosen)
    run_transform(infile=args.json_file)

if __name__=="__main__":
    main()
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index b9dcbe8..081878c 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -12,7 +12,7 @@ from fatcat_scholar.api_entities import *
from fatcat_scholar.djvu import djvu_extract_leaf_texts
from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
from fatcat_scholar.issue_db import IssueDB, SimIssueRow
-from fatcat_scholar.es_transform import es_biblio_from_release, es_release_from_release, DocType
+from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
def parse_pages(raw: str) -> Tuple[Optional[int], Optional[int]]:
@@ -44,21 +44,6 @@ def test_parse_pages():
assert parse_pages("iiv") == (None, None)
-class IntermediateBundle(BaseModel):
- doc_type: DocType
- releases: List[ReleaseEntity]
- biblio_release_ident: Optional[str]
- grobid_fulltext: Optional[Any]
- pdftotext_fulltext: Optional[Any]
- sim_fulltext: Optional[Any]
-
- class Config:
- arbitrary_types_allowed = True
- json_encoders = {
- ReleaseEntity: lambda re: entity_to_dict(re),
- }
-
-
def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]:
"""
Returns a list of release idents in preference order (best first) to
diff --git a/tests/test_transform.py b/tests/test_transform.py
index cb1ad5c..3c29d18 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -2,7 +2,7 @@
import pytest
from fatcat_openapi_client import ReleaseEntity
-from fatcat_scholar.es_transform import *
+from fatcat_scholar.schema import *
from fatcat_scholar.api_entities import *
def test_es_release_from_release():