aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/transform.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r--fatcat_scholar/transform.py267
1 files changed, 158 insertions, 109 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 953ebff..b5a0223 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,4 +1,3 @@
-
import os
import io
import sys
@@ -10,83 +9,89 @@ import internetarchive
from fatcat_scholar.api_entities import *
from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
+from fatcat_scholar.sandcrawler import (
+ SandcrawlerPostgrestClient,
+ SandcrawlerMinioClient,
+)
from fatcat_scholar.issue_db import IssueDB, SimIssueRow
from fatcat_scholar.schema import *
from fatcat_scholar.grobid2json import teixml2json
def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]:
- if not sim['page_texts']:
+ if not sim["page_texts"]:
return None
- first_page = sim['page_texts'][0]['page_num']
- issue_item = sim['issue_item']
+ first_page = sim["page_texts"][0]["page_num"]
+ issue_item = sim["issue_item"]
return ScholarFulltext(
- lang_code=None, # TODO: pub/issue metadata? or langdetect?
- body="\n".join([p['raw_text'] for p in sim['page_texts']]),
- #acknowledgement=None,
- #annex=None,
- release_ident=sim.get('release_ident'),
- #file_ident=None,
- #file_sha1=None,
- #file_mimetype=None,
+ lang_code=None, # TODO: pub/issue metadata? or langdetect?
+ body="\n".join([p["raw_text"] for p in sim["page_texts"]]),
+ # acknowledgement=None,
+ # annex=None,
+ release_ident=sim.get("release_ident"),
+ # file_ident=None,
+ # file_sha1=None,
+ # file_mimetype=None,
thumbnail_url=f"https://archive.org/serve/{issue_item}/__ia_thumb.jpg",
access_url=f"https://archive.org/details/{issue_item}/page/{first_page}",
access_type=AccessType.ia_sim,
)
+
def es_sim_from_sim(sim: Dict[str, Any]) -> ScholarSim:
first_page = None
- if sim['page_texts']:
- first_page = sim['page_texts'][0]['page_num']
+ if sim["page_texts"]:
+ first_page = sim["page_texts"][0]["page_num"]
return ScholarSim(
- issue_item=sim['issue_item'],
- pub_collection=sim['pub_item_metadata']['metadata']['identifier'],
- sim_pubid=sim['issue_item_metadata']['metadata']['sim_pubid'],
+ issue_item=sim["issue_item"],
+ pub_collection=sim["pub_item_metadata"]["metadata"]["identifier"],
+ sim_pubid=sim["issue_item_metadata"]["metadata"]["sim_pubid"],
first_page=first_page,
)
+
SIM_RELEASE_TYPE_MAP = {
- 'Scholarly Journals': 'article-journal',
+ "Scholarly Journals": "article-journal",
# TODO:
}
SIM_LANG_MAP = {
- 'English': 'en',
+ "English": "en",
# TODO:
}
SIM_COUNTRY_MAP = {
- 'Netherlands': 'nl',
+ "Netherlands": "nl",
# TODO:
}
+
def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
- issue_meta = sim['issue_item_metadata']['metadata']
- pub_meta = sim['pub_item_metadata']['metadata']
+ issue_meta = sim["issue_item_metadata"]["metadata"]
+ pub_meta = sim["pub_item_metadata"]["metadata"]
first_page = None
- if sim['page_texts']:
- first_page = sim['page_texts'][0]['page_num']
- container_name = sim['pub_item_metadata']['metadata']['title']
+ if sim["page_texts"]:
+ first_page = sim["page_texts"][0]["page_num"]
+ container_name = sim["pub_item_metadata"]["metadata"]["title"]
last_word = container_name.split()[-1]
- if len(last_word) == 9 and last_word[4] == '-':
+ if len(last_word) == 9 and last_word[4] == "-":
container_name = container_name[:-10]
issns = []
- raw_issn = issue_meta.get('issn')
+ raw_issn = issue_meta.get("issn")
if raw_issn and len(raw_issn) == 9:
issns.append(raw_issn)
- volume = issue_meta.get('volume')
+ volume = issue_meta.get("volume")
volume_int = None
if volume and volume.isdigit():
volume_int = int(volume)
- issue = issue_meta.get('issue')
+ issue = issue_meta.get("issue")
issue_int = None
if issue and issue.isdigit():
issue_int = int(issue)
- date = issue_meta.get('date')
+ date = issue_meta.get("date")
release_year = None
if date and len(date) > 4 and date[:4].isdigit():
release_year = int(date[:4])
@@ -96,52 +101,52 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
release_date = date
return ScholarBiblio(
- #release_ident=release.ident,
+ # release_ident=release.ident,
title=None,
- #subtitle=None,
- #original_title=release.original_title,
+ # subtitle=None,
+ # original_title=release.original_title,
release_date=release_date,
release_year=release_year,
- release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get('pub_type')),
- release_stage="published", # as a default
- #withdrawn_status=release.withdrawn_status,
- lang_code=SIM_LANG_MAP.get(pub_meta.get('language')),
- country_code=SIM_COUNTRY_MAP.get(pub_meta.get('country')),
+ release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get("pub_type")),
+ release_stage="published", # as a default
+ # withdrawn_status=release.withdrawn_status,
+ lang_code=SIM_LANG_MAP.get(pub_meta.get("language")),
+ country_code=SIM_COUNTRY_MAP.get(pub_meta.get("country")),
volume=volume,
volume_int=volume_int,
issue=issue,
issue_int=issue_int,
- pages=sim.get('pages'),
+ pages=sim.get("pages"),
first_page=first_page,
first_page_int=None,
- #number=None,
-
+ # number=None,
# no external identifiers
-
- #license_slug=release.license_slug,
- publisher=issue_meta.get('publisher'),
+ # license_slug=release.license_slug,
+ publisher=issue_meta.get("publisher"),
container_name=container_name,
- container_original_name=None, # TODO pass-through
- container_ident=None, # TODO: pass-through
- container_type=None, # TODO
- container_issnl=None, # TODO: pass-through
+ container_original_name=None, # TODO pass-through
+ container_ident=None, # TODO: pass-through
+ container_type=None, # TODO
+ container_issnl=None, # TODO: pass-through
issns=issns,
-
# no contrib/affiliation info
contrib_names=[],
affiliations=[],
)
-def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity) -> ScholarFulltext:
+
+def _add_file_release_meta(
+ fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity
+) -> ScholarFulltext:
best_url = None
best_url_type = None
for url in fe.urls:
best_url = url.url
best_url_type = AccessType.web
- if '//archive.org/' in url.url:
+ if "//archive.org/" in url.url:
best_url_type = AccessType.ia_file
break
- elif '//web.archive.org/' in url.url:
+ elif "//web.archive.org/" in url.url:
best_url_type = AccessType.wayback
break
if url.rel == "repository":
@@ -157,30 +162,36 @@ def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: Fil
return fulltext
-def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
+def es_fulltext_from_grobid(
+ tei_xml: str, re: ReleaseEntity, fe: FileEntity
+) -> Optional[ScholarFulltext]:
obj = teixml2json(tei_xml)
- if not obj.get('body'):
+ if not obj.get("body"):
return None
ret = ScholarFulltext(
- lang_code=obj.get('lang'),
- body=obj.get('body'),
- acknowledgement=obj.get('acknowledgement'),
- annex=obj.get('annex'),
- thumbnail_url=None, # TODO: sandcrawler thumbnails
+ lang_code=obj.get("lang"),
+ body=obj.get("body"),
+ acknowledgement=obj.get("acknowledgement"),
+ annex=obj.get("annex"),
+ thumbnail_url=None, # TODO: sandcrawler thumbnails
)
return _add_file_release_meta(ret, re, fe)
-def es_fulltext_from_pdftotext(pdftotext: Any, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
+
+def es_fulltext_from_pdftotext(
+ pdftotext: Any, re: ReleaseEntity, fe: FileEntity
+) -> Optional[ScholarFulltext]:
ret = ScholarFulltext(
lang_code=re.language,
- body=pdftotext['raw_text'],
+ body=pdftotext["raw_text"],
acknowledgement=None,
annex=None,
- thumbnail_url=None, # TODO: sandcrawler thumbnails
+ thumbnail_url=None, # TODO: sandcrawler thumbnails
)
return _add_file_release_meta(ret, re, fe)
+
def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
tags: List[str] = []
@@ -203,7 +214,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
work_ident = heavy.releases[0].work_id
key = f"work_{work_ident}"
assert heavy.biblio_release_ident
- primary_release = [r for r in heavy.releases if r.ident == heavy.biblio_release_ident][0]
+ primary_release = [
+ r for r in heavy.releases if r.ident == heavy.biblio_release_ident
+ ][0]
biblio = es_biblio_from_release(primary_release)
# TODO: abstracts from releases also; abstracts_dict; abstracts from GROBID parse
@@ -212,19 +225,44 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
raise NotImplementedError(f"doc_type: {heavy.doc_type}")
if heavy.grobid_fulltext:
- fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
- fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
- fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)
+ fulltext_release = [
+ r
+ for r in heavy.releases
+ if r.ident == heavy.grobid_fulltext["release_ident"]
+ ][0]
+ fulltext_file = [
+ f
+ for f in fulltext_release.files
+ if f.ident == heavy.grobid_fulltext["file_ident"]
+ ][0]
+ fulltext = es_fulltext_from_grobid(
+ heavy.grobid_fulltext["tei_xml"], fulltext_release, fulltext_file
+ )
# hack to pull through thumbnail from local pdftotext
- if fulltext and fulltext.file_sha1 and not fulltext.thumbnail_url and heavy.pdftotext_fulltext:
+ if (
+ fulltext
+ and fulltext.file_sha1
+ and not fulltext.thumbnail_url
+ and heavy.pdftotext_fulltext
+ ):
# https://covid19.fatcat.wiki/fulltext_web/thumbnail/c9/c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521.png
fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png"
if not fulltext and heavy.pdftotext_fulltext:
- fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0]
- fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0]
- fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file)
+ fulltext_release = [
+ r
+ for r in heavy.releases
+ if r.ident == heavy.pdftotext_fulltext["release_ident"]
+ ][0]
+ fulltext_file = [
+ f
+ for f in fulltext_release.files
+ if f.ident == heavy.pdftotext_fulltext["file_ident"]
+ ][0]
+ fulltext = es_fulltext_from_pdftotext(
+ heavy.pdftotext_fulltext, fulltext_release, fulltext_file
+ )
# TODO: additional access list
access_dict = dict()
@@ -246,41 +284,41 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
# tags
if biblio.license_slug and biblio.license_slug.lower().startswith("cc-"):
- tags.append('oa')
+ tags.append("oa")
if primary_release and primary_release.container:
container = primary_release.container
if container.extra:
- if container.extra.get('doaj'):
- tags.append('doaj')
- tags.append('oa')
- if container.extra.get('road'):
- tags.append('road')
- tags.append('oa')
- if container.extra.get('szczepanski'):
- tags.append('szczepanski')
- tags.append('oa')
- if container.extra.get('ia', {}).get('longtail_oa'):
- tags.append('longtail')
- tags.append('oa')
- if container.extra.get('sherpa_romeo', {}).get('color') == 'white':
- tags.append('oa')
- if container.extra.get('default_license', '').lower().startswith('cc-'):
- tags.append('oa')
- if container.extra.get('platform'):
+ if container.extra.get("doaj"):
+ tags.append("doaj")
+ tags.append("oa")
+ if container.extra.get("road"):
+ tags.append("road")
+ tags.append("oa")
+ if container.extra.get("szczepanski"):
+ tags.append("szczepanski")
+ tags.append("oa")
+ if container.extra.get("ia", {}).get("longtail_oa"):
+ tags.append("longtail")
+ tags.append("oa")
+ if container.extra.get("sherpa_romeo", {}).get("color") == "white":
+ tags.append("oa")
+ if container.extra.get("default_license", "").lower().startswith("cc-"):
+ tags.append("oa")
+ if container.extra.get("platform"):
# scielo, ojs, wordpress, etc
- tags.append(container.extra['platform'].lower())
- if biblio.doi_prefix == '10.2307':
- tags.append('jstor')
+ tags.append(container.extra["platform"].lower())
+ if biblio.doi_prefix == "10.2307":
+ tags.append("jstor")
# biorxiv/medrxiv hacks
if not biblio.container_name and biblio.release_stage != "published":
for _, acc in access_dict.items():
if "://www.medrxiv.org/" in acc.access_url:
- biblio.container_name = 'medRxiv'
+ biblio.container_name = "medRxiv"
if biblio.release_stage == None:
biblio.release_stage = "submitted"
elif "://www.biorxiv.org/" in acc.access_url:
- biblio.container_name = 'bioRxiv'
+ biblio.container_name = "bioRxiv"
if biblio.release_stage == None:
biblio.release_stage = "submitted"
tags = list(set(tags))
@@ -291,7 +329,6 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
doc_index_ts=datetime.datetime.utcnow(),
work_ident=work_ident,
tags=tags,
-
biblio=biblio,
fulltext=fulltext,
ia_sim=ia_sim,
@@ -300,23 +337,28 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
access=list(access_dict.values()),
)
+
def run_transform(infile):
for line in infile:
obj = json.loads(line)
heavy = IntermediateBundle(
- doc_type=DocType(obj['doc_type']),
- releases=[entity_from_json(json.dumps(re), ReleaseEntity) for re in obj['releases']],
- biblio_release_ident=obj.get('biblio_release_ident'),
- grobid_fulltext=obj.get('grobid_fulltext'),
- pdftotext_fulltext=obj.get('pdftotext_fulltext'),
- sim_fulltext=obj.get('sim_fulltext'),
+ doc_type=DocType(obj["doc_type"]),
+ releases=[
+ entity_from_json(json.dumps(re), ReleaseEntity)
+ for re in obj["releases"]
+ ],
+ biblio_release_ident=obj.get("biblio_release_ident"),
+ grobid_fulltext=obj.get("grobid_fulltext"),
+ pdftotext_fulltext=obj.get("pdftotext_fulltext"),
+ sim_fulltext=obj.get("sim_fulltext"),
)
es_doc = transform_heavy(heavy)
if not es_doc:
continue
print(es_doc.json())
+
def main():
"""
Run this command like:
@@ -325,25 +367,32 @@ def main():
"""
parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
subparsers = parser.add_subparsers()
- sub = subparsers.add_parser('run_transform',
- help="iterates through 'heavy' intermediate")
- sub.set_defaults(func='run_transform')
- sub.add_argument("json_file",
+ sub = subparsers.add_parser(
+ "run_transform", help="iterates through 'heavy' intermediate"
+ )
+ sub.set_defaults(func="run_transform")
+ sub.add_argument(
+ "json_file",
help="intermediate globs as JSON-lines",
- nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+ nargs="?",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do! (try --help)")
sys.exit(-1)
- if args.func == 'run_transform':
+ if args.func == "run_transform":
run_transform(infile=args.json_file)
else:
raise NotImplementedError(args.func)
-if __name__=="__main__":
+
+if __name__ == "__main__":
main()