-rw-r--r-- | fatcat_scholar/api_entities.py  |   6
-rw-r--r-- | fatcat_scholar/djvu.py          |  12
-rwxr-xr-x | fatcat_scholar/grobid2json.py   | 174
-rw-r--r-- | fatcat_scholar/hacks.py         |   6
-rw-r--r-- | fatcat_scholar/issue_db.py      | 250
-rw-r--r-- | fatcat_scholar/sandcrawler.py   |  35
-rw-r--r-- | fatcat_scholar/schema.py        | 102
-rw-r--r-- | fatcat_scholar/search.py        |  70
-rw-r--r-- | fatcat_scholar/sim_pipeline.py  | 110
-rw-r--r-- | fatcat_scholar/transform.py     | 267
-rw-r--r-- | fatcat_scholar/web.py           |  64
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 188
-rw-r--r-- | tests/test_djvu_parse.py        |   5
-rw-r--r-- | tests/test_scrub.py             |   7
-rw-r--r-- | tests/test_transform.py         |  20
15 files changed, 797 insertions, 519 deletions
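
The per-file hunks below are almost entirely mechanical formatting changes: string literals normalized from single to double quotes, long calls and literals wrapped one argument per line with trailing commas, and two blank lines enforced between top-level definitions. This is consistent with running an auto-formatter such as Black over the package (an inference from the hunks themselves, not stated in the diff). A minimal before/after sketch of the dominant pattern, taken from the fatcat_scholar/api_entities.py hunk:

    import collections

    # before (removed line): thing = collections.namedtuple('Thing', ['data'])
    # after (added line) -- only the quote style changes, behavior is identical:
    thing = collections.namedtuple("Thing", ["data"])
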
diff --git a/fatcat_scholar/api_entities.py b/fatcat_scholar/api_entities.py index 738c5c8..df24eda 100644 --- a/fatcat_scholar/api_entities.py +++ b/fatcat_scholar/api_entities.py @@ -1,10 +1,10 @@ - import json import collections from fatcat_openapi_client import ApiClient _global_serde_api_client = ApiClient() + def entity_to_dict(entity, api_client=None): """ Hack to take advantage of the code-generated serialization code. @@ -19,6 +19,7 @@ def entity_to_dict(entity, api_client=None): api_client = _global_serde_api_client return api_client.sanitize_for_serialization(entity) + def entity_from_json(json_str, entity_type, api_client=None): """ Hack to take advantage of the code-generated deserialization code @@ -27,10 +28,11 @@ def entity_from_json(json_str, entity_type, api_client=None): """ if not api_client: api_client = _global_serde_api_client - thing = collections.namedtuple('Thing', ['data']) + thing = collections.namedtuple("Thing", ["data"]) thing.data = json_str return api_client.deserialize(thing, entity_type) + def entity_from_dict(obj, entity_type, api_client=None): json_str = json.dumps(obj) return entity_from_json(json_str, entity_type, api_client=api_client) diff --git a/fatcat_scholar/djvu.py b/fatcat_scholar/djvu.py index b4a0774..ca3e412 100644 --- a/fatcat_scholar/djvu.py +++ b/fatcat_scholar/djvu.py @@ -1,9 +1,11 @@ - from io import StringIO from typing import List, Dict, Tuple, Optional, Any, Sequence import xml.etree.ElementTree as ET -def djvu_extract_leaf_texts(blob: StringIO, only_leaves: Optional[List[int]] = None) -> Dict[int, str]: + +def djvu_extract_leaf_texts( + blob: StringIO, only_leaves: Optional[List[int]] = None +) -> Dict[int, str]: """ Takes an in-memory djvu XML string (note: not an actual djvu file, just the IA XML file type), and iterates throug @@ -21,12 +23,12 @@ def djvu_extract_leaf_texts(blob: StringIO, only_leaves: Optional[List[int]] = N continue # <OBJECT data="file://localhost//tmp/derive/ERIC_ED441501//ERIC_ED441501.djvu" height="6545" type="image/x.djvu" usemap="ERIC_ED441501_0002.djvu" width="5048"> - usemap = element.get('usemap') + usemap = element.get("usemap") if not usemap: continue leaf_num = None try: - leaf_num = int(usemap.replace('.djvu', '').split('_')[-1]) + leaf_num = int(usemap.replace(".djvu", "").split("_")[-1]) except: continue if only_leaves is not None and leaf_num is not None: @@ -42,7 +44,7 @@ def djvu_extract_leaf_texts(blob: StringIO, only_leaves: Optional[List[int]] = N if p_text: paragraph_texts.append(p_text) page_text = "\n".join(paragraph_texts) - #print(f"### {leaf_num}\n{page_text}\n") + # print(f"### {leaf_num}\n{page_text}\n") if page_text: leaf_text[leaf_num] = page_text element.clear() diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py index 9c2ffad..57d039e 100755 --- a/fatcat_scholar/grobid2json.py +++ b/fatcat_scholar/grobid2json.py @@ -33,52 +33,55 @@ import xml.etree.ElementTree as ET xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" + def all_authors(elem): names = [] - for author in elem.findall('.//{%s}author' % ns): - pn = author.find('./{%s}persName' % ns) + for author in elem.findall(".//{%s}author" % ns): + pn = author.find("./{%s}persName" % ns) if not pn: continue - given_name = pn.findtext('./{%s}forename' % ns) or None - surname = pn.findtext('./{%s}surname' % ns) or None - full_name = ' '.join(pn.itertext()) + given_name = pn.findtext("./{%s}forename" % ns) or None + surname = pn.findtext("./{%s}surname" % ns) or None + full_name 
= " ".join(pn.itertext()) obj = dict(name=full_name) if given_name: - obj['given_name'] = given_name + obj["given_name"] = given_name if surname: - obj['surname'] = surname - ae = author.find('./{%s}affiliation' % ns) + obj["surname"] = surname + ae = author.find("./{%s}affiliation" % ns) if ae: affiliation = dict() - for on in ae.findall('./{%s}orgName' % ns): - affiliation[on.get('type')] = on.text - addr_e = ae.find('./{%s}address' % ns) + for on in ae.findall("./{%s}orgName" % ns): + affiliation[on.get("type")] = on.text + addr_e = ae.find("./{%s}address" % ns) if addr_e: address = dict() for t in addr_e.getchildren(): - address[t.tag.split('}')[-1]] = t.text + address[t.tag.split("}")[-1]] = t.text if address: - affiliation['address'] = address - #affiliation['address'] = { + affiliation["address"] = address + # affiliation['address'] = { # 'post_code': addr.findtext('./{%s}postCode' % ns) or None, # 'settlement': addr.findtext('./{%s}settlement' % ns) or None, # 'country': addr.findtext('./{%s}country' % ns) or None, - #} - obj['affiliation'] = affiliation + # } + obj["affiliation"] = affiliation names.append(obj) return names def journal_info(elem): journal = dict() - journal['name'] = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns)) - journal['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns)) - if journal['publisher'] == '': - journal['publisher'] = None - journal['issn'] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) - journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns) - journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) - journal['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) + journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns)) + journal["publisher"] = elem.findtext( + ".//{%s}publicationStmt/{%s}publisher" % (ns, ns) + ) + if journal["publisher"] == "": + journal["publisher"] = None + journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) + journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns) + journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) + journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) keys = list(journal.keys()) # remove empty/null keys @@ -90,32 +93,32 @@ def journal_info(elem): def biblio_info(elem): ref = dict() - ref['id'] = elem.attrib.get('{http://www.w3.org/XML/1998/namespace}id') + ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") # Title stuff is messy in references... 
- ref['title'] = elem.findtext('.//{%s}analytic/{%s}title' % (ns, ns)) - other_title = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns)) + ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns)) + other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns)) if other_title: - if ref['title']: - ref['journal'] = other_title + if ref["title"]: + ref["journal"] = other_title else: - ref['journal'] = None - ref['title'] = other_title - ref['authors'] = all_authors(elem) - ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns)) - if ref['publisher'] == '': - ref['publisher'] = None + ref["journal"] = None + ref["title"] = other_title + ref["authors"] = all_authors(elem) + ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns)) + if ref["publisher"] == "": + ref["publisher"] = None date = elem.find('.//{%s}date[@type="published"]' % ns) - ref['date'] = (date != None) and date.attrib.get('when') - ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) - ref['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) - el = elem.find('.//{%s}ptr[@target]' % ns) + ref["date"] = (date != None) and date.attrib.get("when") + ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) + ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) + el = elem.find(".//{%s}ptr[@target]" % ns) if el is not None: - ref['url'] = el.attrib['target'] + ref["url"] = el.attrib["target"] # Hand correction - if ref['url'].endswith(".Lastaccessed"): - ref['url'] = ref['url'].replace(".Lastaccessed", "") + if ref["url"].endswith(".Lastaccessed"): + ref["url"] = ref["url"].replace(".Lastaccessed", "") else: - ref['url'] = None + ref["url"] = None return ref @@ -128,48 +131,50 @@ def teixml2json(content, encumbered=True): info = dict() - #print(content) - #print(content.getvalue()) + # print(content) + # print(content.getvalue()) tree = ET.parse(content) tei = tree.getroot() - header = tei.find('.//{%s}teiHeader' % ns) + header = tei.find(".//{%s}teiHeader" % ns) if header is None: raise ValueError("XML does not look like TEI format") - application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0] - info['grobid_version'] = application_tag.attrib['version'].strip() - info['grobid_timestamp'] = application_tag.attrib['when'].strip() - info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns)) - info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns))) - info['journal'] = journal_info(header) + application_tag = header.findall(".//{%s}appInfo/{%s}application" % (ns, ns))[0] + info["grobid_version"] = application_tag.attrib["version"].strip() + info["grobid_timestamp"] = application_tag.attrib["when"].strip() + info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns)) + info["authors"] = all_authors( + header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns)) + ) + info["journal"] = journal_info(header) date = header.find('.//{%s}date[@type="published"]' % ns) - info['date'] = (date != None) and date.attrib.get('when') - info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) - info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) - if info['doi']: - info['doi'] = info['doi'].lower() + info["date"] = (date != None) and date.attrib.get("when") + info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) + info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) + if 
info["doi"]: + info["doi"] = info["doi"].lower() refs = [] - for (i, bs) in enumerate(tei.findall('.//{%s}listBibl/{%s}biblStruct' % (ns, ns))): + for (i, bs) in enumerate(tei.findall(".//{%s}listBibl/{%s}biblStruct" % (ns, ns))): ref = biblio_info(bs) - ref['index'] = i + ref["index"] = i refs.append(ref) - info['citations'] = refs + info["citations"] = refs - text = tei.find('.//{%s}text' % (ns)) - #print(text.attrib) - if text.attrib.get('{%s}lang' % xml_ns): - info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang + text = tei.find(".//{%s}text" % (ns)) + # print(text.attrib) + if text.attrib.get("{%s}lang" % xml_ns): + info["language_code"] = text.attrib["{%s}lang" % xml_ns] # xml:lang if encumbered: - el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns)) - info['abstract'] = (el or None) and " ".join(el.itertext()).strip() - el = tei.find('.//{%s}text/{%s}body' % (ns, ns)) - info['body'] = (el or None) and " ".join(el.itertext()).strip() + el = tei.find(".//{%s}profileDesc/{%s}abstract" % (ns, ns)) + info["abstract"] = (el or None) and " ".join(el.itertext()).strip() + el = tei.find(".//{%s}text/{%s}body" % (ns, ns)) + info["body"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns)) - info['acknowledgement'] = (el or None) and " ".join(el.itertext()).strip() + info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns)) - info['annex'] = (el or None) and " ".join(el.itertext()).strip() + info["annex"] = (el or None) and " ".join(el.itertext()).strip() # remove empty/null keys keys = list(info.keys()) @@ -178,24 +183,31 @@ def teixml2json(content, encumbered=True): info.pop(k) return info -def main(): # pragma no cover + +def main(): # pragma no cover parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="GROBID TEI XML to JSON", - usage="%(prog)s [options] <teifile>...") - parser.add_argument("--no-encumbered", + usage="%(prog)s [options] <teifile>...", + ) + parser.add_argument( + "--no-encumbered", action="store_true", - help="don't include ambiguously copyright encumbered fields (eg, abstract, body)") - parser.add_argument("teifiles", nargs='+') + help="don't include ambiguously copyright encumbered fields (eg, abstract, body)", + ) + parser.add_argument("teifiles", nargs="+") args = parser.parse_args() for filename in args.teifiles: - content = open(filename, 'r') - print(json.dumps( - teixml2json(content, - encumbered=(not args.no_encumbered)), - sort_keys=True)) + content = open(filename, "r") + print( + json.dumps( + teixml2json(content, encumbered=(not args.no_encumbered)), + sort_keys=True, + ) + ) + -if __name__=='__main__': # pragma no cover +if __name__ == "__main__": # pragma no cover main() diff --git a/fatcat_scholar/hacks.py b/fatcat_scholar/hacks.py index fc1dacd..710a25f 100644 --- a/fatcat_scholar/hacks.py +++ b/fatcat_scholar/hacks.py @@ -1,10 +1,10 @@ - import typing import jinja2 from starlette.background import BackgroundTask from starlette.templating import _TemplateResponse + class Jinja2Templates: """ This is a patched version of starlette.templating.Jinja2Templates that @@ -15,7 +15,9 @@ class Jinja2Templates: assert jinja2 is not None, "jinja2 must be installed to use Jinja2Templates" self.env = self.get_env(directory, extensions) - def get_env(self, directory: str, extensions: typing.List[str] = []) -> "jinja2.Environment": + def get_env( + 
self, directory: str, extensions: typing.List[str] = [] + ) -> "jinja2.Environment": @jinja2.contextfunction def url_for(context: dict, name: str, **path_params: typing.Any) -> str: request = context["request"] diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py index 4f5ff53..12ffa32 100644 --- a/fatcat_scholar/issue_db.py +++ b/fatcat_scholar/issue_db.py @@ -1,4 +1,3 @@ - import sys import json import sqlite3 @@ -9,6 +8,7 @@ import fatcat_openapi_client import elasticsearch from elasticsearch_dsl import Search, Q + @dataclass class SimPubRow: sim_pubid: str @@ -23,7 +23,17 @@ class SimPubRow: wikidata_qid: Optional[str] def tuple(self): - return (self.sim_pubid, self.pub_collection, self.title, self.issn, self.pub_type, self.publisher, self.container_issnl, self.container_ident, self.wikidata_qid) + return ( + self.sim_pubid, + self.pub_collection, + self.title, + self.issn, + self.pub_type, + self.publisher, + self.container_issnl, + self.container_ident, + self.wikidata_qid, + ) @classmethod def from_tuple(cls, row: Any) -> "SimPubRow": @@ -39,6 +49,7 @@ class SimPubRow: wikidata_qid=row[8], ) + @dataclass class SimIssueRow: """ @@ -46,6 +57,7 @@ class SimIssueRow: - distinguish between release count that can do full link with pages, or just in this year/volume/issue? """ + issue_item: str sim_pubid: str year: Optional[int] @@ -56,7 +68,16 @@ class SimIssueRow: release_count: Optional[int] def tuple(self): - return (self.issue_item, self.sim_pubid, self.year, self.volume, self.issue, self.first_page, self.last_page, self.release_count) + return ( + self.issue_item, + self.sim_pubid, + self.year, + self.volume, + self.issue, + self.first_page, + self.last_page, + self.release_count, + ) @classmethod def from_tuple(cls, row: Any) -> "SimIssueRow": @@ -71,6 +92,7 @@ class SimIssueRow: release_count=row[7], ) + @dataclass class ReleaseCountsRow: sim_pubid: str @@ -80,82 +102,100 @@ class ReleaseCountsRow: volume: Optional[str] def tuple(self): - return (self.sim_pubid, self.year, self.volume, self.year_in_sim, self.release_count) + return ( + self.sim_pubid, + self.year, + self.volume, + self.year_in_sim, + self.release_count, + ) -def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, issue: str) -> int: +def es_issue_count( + es_client: Any, container_id: str, year: int, volume: str, issue: str +) -> int: search = Search(using=es_client, index="fatcat_release") - search = search\ - .filter("term", container_id=container_id)\ - .filter("term", year=year)\ - .filter("term", volume=volume)\ - .filter("term", issue=issue)\ + search = ( + search.filter("term", container_id=container_id) + .filter("term", year=year) + .filter("term", volume=volume) + .filter("term", issue=issue) .extra(request_cache=True) + ) return search.count() + def es_container_aggs(es_client: Any, container_id: str) -> List[Dict[str, Any]]: """ What is being returned is a list of dicts, each with year, volume, count keys. 
""" search = Search(using=es_client, index="fatcat_release") - search = search\ - .filter("term", container_id=container_id) - search.aggs\ - .bucket('years', 'terms', field="year")\ - .bucket('volumes', 'terms', field="volume") + search = search.filter("term", container_id=container_id) + search.aggs.bucket("years", "terms", field="year").bucket( + "volumes", "terms", field="volume" + ) search = search[:0] res = search.execute() ret = [] for year in res.aggregations.years.buckets: for volume in year.volumes.buckets: ret.append(dict(count=volume.doc_count, year=year.key, volume=volume.key)) - #print(ret[-1]) + # print(ret[-1]) return ret -class IssueDB(): +class IssueDB: def __init__(self, db_file): """ To create a temporary database, pass ":memory:" as db_file """ - self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE') + self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE") self._pubid2container_map: Dict[str, Optional[str]] = dict() self._container2pubid_map: Dict[str, Optional[str]] = dict() def init_db(self): - self.db.executescript(""" + self.db.executescript( + """ PRAGMA main.page_size = 4096; PRAGMA main.cache_size = 20000; PRAGMA main.locking_mode = EXCLUSIVE; PRAGMA main.synchronous = OFF; - """) - with open('schema/issue_db.sql', 'r') as fschema: + """ + ) + with open("schema/issue_db.sql", "r") as fschema: self.db.executescript(fschema.read()) def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None: if not cur: cur = self.db.cursor() - cur.execute("INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)", - pub.tuple()) + cur.execute( + "INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)", pub.tuple() + ) def insert_sim_issue(self, issue: SimIssueRow, cur: Any = None) -> None: if not cur: cur = self.db.cursor() - cur.execute("INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)", - issue.tuple()) + cur.execute( + "INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)", issue.tuple() + ) def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None: if not cur: cur = self.db.cursor() - cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)", - counts.tuple()) + cur.execute( + "INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)", counts.tuple() + ) def pubid2container(self, sim_pubid: str) -> Optional[str]: if sim_pubid in self._pubid2container_map: return self._pubid2container_map[sim_pubid] - row = list(self.db.execute("SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid])) + row = list( + self.db.execute( + "SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid] + ) + ) if row: self._pubid2container_map[sim_pubid] = row[0][0] return row[0][0] @@ -166,7 +206,12 @@ class IssueDB(): def container2pubid(self, container_ident: str) -> Optional[str]: if container_ident in self._container2pubid_map: return self._container2pubid_map[container_ident] - row = list(self.db.execute("SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;", [container_ident])) + row = list( + self.db.execute( + "SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;", + [container_ident], + ) + ) if row: self._container2pubid_map[container_ident] = row[0][0] return row[0][0] @@ -174,14 +219,23 @@ class IssueDB(): self._pubid2container_map[container_ident] = None return None - def lookup_issue(self, sim_pubid: str, volume: str, issue: str) -> Optional[SimIssueRow]: - row = list(self.db.execute("SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? 
AND issue = ?;", [sim_pubid, volume, issue])) + def lookup_issue( + self, sim_pubid: str, volume: str, issue: str + ) -> Optional[SimIssueRow]: + row = list( + self.db.execute( + "SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;", + [sim_pubid, volume, issue], + ) + ) if not row: return None return SimIssueRow.from_tuple(row[0]) def lookup_pub(self, sim_pubid: str) -> Optional[SimPubRow]: - row = list(self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid])) + row = list( + self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]) + ) if not row: return None return SimPubRow.from_tuple(row[0]) @@ -196,22 +250,22 @@ class IssueDB(): if not line: continue obj = json.loads(line) - meta = obj['metadata'] - assert "periodicals" in meta['collection'] + meta = obj["metadata"] + assert "periodicals" in meta["collection"] container: Optional[fatcat_openapi_client.ContainerEntity] = None - if meta.get('issn'): + if meta.get("issn"): try: - container = api.lookup_container(issnl=meta['issn']) + container = api.lookup_container(issnl=meta["issn"]) except fatcat_openapi_client.ApiException as ae: if ae.status != 404: raise ae row = SimPubRow( - sim_pubid=meta['sim_pubid'], - pub_collection=meta['identifier'], - title=meta['title'], - issn=meta.get('issn'), - pub_type=meta.get('pub_type'), - publisher=meta.get('publisher'), + sim_pubid=meta["sim_pubid"], + pub_collection=meta["identifier"], + title=meta["title"], + issn=meta.get("issn"), + pub_type=meta.get("pub_type"), + publisher=meta.get("publisher"), container_issnl=container and container.issnl, container_ident=container and container.ident, wikidata_qid=container and container.wikidata_qid, @@ -230,28 +284,32 @@ class IssueDB(): if not line: continue obj = json.loads(line) - meta = obj['metadata'] - assert "periodicals" in meta['collection'] - #pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0] - issue_item = meta['identifier'] + meta = obj["metadata"] + assert "periodicals" in meta["collection"] + # pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0] + issue_item = meta["identifier"] # don't index meta items # TODO: handle more weird suffixes like "1-2", "_part_1", "_index-contents" if issue_item.endswith("_index") or issue_item.endswith("_contents"): continue - sim_pubid=meta['sim_pubid'] + sim_pubid = meta["sim_pubid"] year: Optional[int] = None - if meta.get('date') and meta['date'][:4].isdigit(): - year = int(meta['date'][:4]) - volume = meta.get('volume') - issue = meta.get('issue') + if meta.get("date") and meta["date"][:4].isdigit(): + year = int(meta["date"][:4]) + volume = meta.get("volume") + issue = meta.get("issue") first_page: Optional[int] = None last_page: Optional[int] = None - if obj.get('page_numbers'): - pages = [p['pageNumber'] for p in obj['page_numbers']['pages'] if p['pageNumber']] + if obj.get("page_numbers"): + pages = [ + p["pageNumber"] + for p in obj["page_numbers"]["pages"] + if p["pageNumber"] + ] pages = [int(p) for p in pages if p.isdigit()] if len(pages): first_page = min(pages) @@ -261,7 +319,9 @@ class IssueDB(): if year and volume and issue: container_id = self.pubid2container(sim_pubid) if container_id: - release_count = es_issue_count(es_client, container_id, year, volume, issue) + release_count = es_issue_count( + es_client, container_id, year, volume, issue + ) row = SimIssueRow( issue_item=issue_item, @@ -278,17 +338,21 @@ class IssueDB(): self.db.commit() def load_counts(self, es_client: Any): - 
all_pub_containers = list(self.db.execute('SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;')) + all_pub_containers = list( + self.db.execute( + "SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;" + ) + ) cur: Any = self.db.cursor() for (sim_pubid, container_ident) in all_pub_containers: aggs = es_container_aggs(es_client, container_ident) for agg in aggs: row = ReleaseCountsRow( sim_pubid=sim_pubid, - year_in_sim=False, # TODO - release_count=agg['count'], - year=agg['year'], - volume=agg['volume'], + year_in_sim=False, # TODO + release_count=agg["count"], + year=agg["year"], + volume=agg["volume"], ) self.insert_release_counts(row, cur) cur.close() @@ -303,35 +367,48 @@ def main(): """ parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) subparsers = parser.add_subparsers() - parser.add_argument("--db-file", + parser.add_argument( + "--db-file", help="sqlite3 database file to open", - default='data/issue_db.sqlite', - type=str) - - sub = subparsers.add_parser('init_db', - help="create sqlite3 output file and tables") - sub.set_defaults(func='init_db') - - sub = subparsers.add_parser('load_pubs', - help="update container-level stats from JSON file") - sub.set_defaults(func='load_pubs') - sub.add_argument("json_file", + default="data/issue_db.sqlite", + type=str, + ) + + sub = subparsers.add_parser("init_db", help="create sqlite3 output file and tables") + sub.set_defaults(func="init_db") + + sub = subparsers.add_parser( + "load_pubs", help="update container-level stats from JSON file" + ) + sub.set_defaults(func="load_pubs") + sub.add_argument( + "json_file", help="collection-level metadata, as JSON-lines", - nargs='?', default=sys.stdin, type=argparse.FileType('r')) - - sub = subparsers.add_parser('load_issues', - help="update item-level stats from JSON file") - sub.set_defaults(func='load_issues') - sub.add_argument("json_file", + nargs="?", + default=sys.stdin, + type=argparse.FileType("r"), + ) + + sub = subparsers.add_parser( + "load_issues", help="update item-level stats from JSON file" + ) + sub.set_defaults(func="load_issues") + sub.add_argument( + "json_file", help="item-level metadata, as JSON-lines", - nargs='?', default=sys.stdin, type=argparse.FileType('r')) + nargs="?", + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub = subparsers.add_parser('load_counts', - help="update volume-level stats from elasticsearch endpoint") - sub.set_defaults(func='load_counts') + sub = subparsers.add_parser( + "load_counts", help="update volume-level stats from elasticsearch endpoint" + ) + sub.set_defaults(func="load_counts") args = parser.parse_args() if not args.__dict__.get("func"): @@ -342,15 +419,16 @@ def main(): api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient()) es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki") - if args.func == 'load_pubs': + if args.func == "load_pubs": idb.load_pubs(args.json_file, api) - elif args.func == 'load_issues': + elif args.func == "load_issues": idb.load_issues(args.json_file, es_client) - elif args.func == 'load_counts': + elif args.func == "load_counts": idb.load_counts(es_client) else: func = getattr(idb, args.func) func() -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py index db6014f..408682f 100644 --- a/fatcat_scholar/sandcrawler.py +++ 
b/fatcat_scholar/sandcrawler.py @@ -1,16 +1,15 @@ - import json import minio import requests from typing import Dict, Optional, Any -class SandcrawlerPostgrestClient(): +class SandcrawlerPostgrestClient: def __init__(self, api_url: str): self.api_url = api_url - + def get_grobid(self, sha1: str) -> Optional[Dict[str, Any]]: - resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex='eq.'+sha1)) + resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1)) resp.raise_for_status() resp_json = resp.json() if resp_json: @@ -20,8 +19,13 @@ class SandcrawlerPostgrestClient(): class SandcrawlerMinioClient(object): - - def __init__(self, host_url: str, access_key: Optional[str] = None, secret_key: Optional[str] = None, default_bucket: Optional[str] = "sandcrawler"): + def __init__( + self, + host_url: str, + access_key: Optional[str] = None, + secret_key: Optional[str] = None, + default_bucket: Optional[str] = "sandcrawler", + ): """ host is minio connection string (host:port) access and secret key are as expected @@ -34,10 +38,7 @@ class SandcrawlerMinioClient(object): secret_key=os.environ['MINIO_SECRET_KEY'], """ self.mc = minio.Minio( - host_url, - access_key=access_key, - secret_key=secret_key, - secure=False, + host_url, access_key=access_key, secret_key=secret_key, secure=False, ) self.default_bucket = default_bucket @@ -48,14 +49,9 @@ class SandcrawlerMinioClient(object): prefix = "" assert len(sha1hex) == 40 obj_path = "{}{}/{}/{}/{}{}".format( - prefix, - folder, - sha1hex[0:2], - sha1hex[2:4], - sha1hex, - extension, + prefix, folder, sha1hex[0:2], sha1hex[2:4], sha1hex, extension, ) - return obj_path + return obj_path def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None): """ @@ -67,9 +63,6 @@ class SandcrawlerMinioClient(object): if not bucket: bucket = self.default_bucket assert bucket - blob = self.mc.get_object( - bucket, - obj_path, - ) + blob = self.mc.get_object(bucket, obj_path,) # TODO: optionally verify SHA-1? return blob.data diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 10742fb..110991d 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -1,4 +1,3 @@ - """ Originally wrote these as dataclasses using pydantic.dataclasses, but we don't get serialization for free with those. This is useful for things like @@ -22,6 +21,7 @@ class DocType(str, Enum): work = "work" sim_page = "sim_page" + class IntermediateBundle(BaseModel): doc_type: DocType releases: List[ReleaseEntity] @@ -47,6 +47,7 @@ class AccessType(str, Enum): loginwall = "loginwall" shadow = "shadow" + class ScholarBiblio(BaseModel): release_ident: Optional[str] title: Optional[str] @@ -60,12 +61,12 @@ class ScholarBiblio(BaseModel): lang_code: Optional[str] country_code: Optional[str] volume: Optional[str] - volume_int: Optional[str] # TODO: needed? + volume_int: Optional[str] # TODO: needed? issue: Optional[str] - issue_int: Optional[str] # TODO: needed? + issue_int: Optional[str] # TODO: needed? pages: Optional[str] first_page: Optional[str] - first_page_int: Optional[str] # TODO: needed? + first_page_int: Optional[str] # TODO: needed? 
number: Optional[str] doi: Optional[str] @@ -93,6 +94,7 @@ class ScholarBiblio(BaseModel): contrib_names: List[str] affiliations: List[str] + class ScholarFulltext(BaseModel): lang_code: Optional[str] body: str @@ -106,6 +108,7 @@ class ScholarFulltext(BaseModel): access_url: Optional[str] access_type: Optional[AccessType] + class ScholarRelease(BaseModel): ident: Optional[str] revision: Optional[str] @@ -133,16 +136,19 @@ class ScholarRelease(BaseModel): container_issnl: Optional[str] container_type: Optional[str] + class ScholarSim(BaseModel): issue_item: str pub_collection: str sim_pubid: str first_page: Optional[str] + class ScholarAbstract(BaseModel): body: str lang_code: Optional[str] + class ScholarAccess(BaseModel): access_type: AccessType access_url: str @@ -150,9 +156,10 @@ class ScholarAccess(BaseModel): file_ident: Optional[str] release_ident: Optional[str] + class ScholarDoc(BaseModel): key: str - doc_type: str # enum: work or page + doc_type: str # enum: work or page doc_index_ts: datetime.datetime work_ident: Optional[str] tags: List[str] = [] @@ -164,29 +171,33 @@ class ScholarDoc(BaseModel): releases: List[ScholarRelease] access: List[ScholarAccess] + def doi_split_prefix(doi: str) -> str: - return doi.split('/')[0] + return doi.split("/")[0] + def release_doi_registrar(release: ReleaseEntity) -> Optional[str]: if not release.ext_ids.doi or not release.extra: return None - for registrar in ('crossref', 'datacite', 'jalc'): + for registrar in ("crossref", "datacite", "jalc"): if registrar in release.extra: return registrar # TODO: should we default to Crossref? return None + UNWANTED_ABSTRACT_PREFIXES = [ # roughly sort this long to short - 'Abstract No Abstract ', - 'Publisher Summary ', - 'Abstract ', - 'ABSTRACT ', - 'Summary ', - 'Background: ', - 'Background ', + "Abstract No Abstract ", + "Publisher Summary ", + "Abstract ", + "ABSTRACT ", + "Summary ", + "Background: ", + "Background ", ] + def scrub_text(raw: str, mimetype: str = None) -> str: """ This function takes a mimetype-hinted string and tries to reduce it to a @@ -201,25 +212,26 @@ def scrub_text(raw: str, mimetype: str = None) -> str: text = ftfy.fix_text(raw) # remove HTML - text = BeautifulSoup(text, 'html.parser').get_text() + text = BeautifulSoup(text, "html.parser").get_text() # TODO: for performance, compile these as globals? 
# Three regexes below adapted from Blendle cleaner.py # https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29 - text = re.sub(r'…', '...', text) - text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text) - text = re.sub(r'[„“]|(\'\')|(,,)', '"', text) - text = re.sub(r'\s+', ' ', text).strip() + text = re.sub(r"…", "...", text) + text = re.sub(r"[`‘’‛⸂⸃⸌⸍⸜⸝]", "'", text) + text = re.sub(r"[„“]|(\'\')|(,,)", '"', text) + text = re.sub(r"\s+", " ", text).strip() # hack to remove abstract prefixes for prefix in UNWANTED_ABSTRACT_PREFIXES: if text.startswith(prefix): - text = text[len(prefix):] + text = text[len(prefix) :] break assert text, "Empty abstract" return text + def contrib_name(contrib: ReleaseContrib) -> str: # TODO: support more cultural normals for name presentation if contrib.raw_name: @@ -231,36 +243,45 @@ def contrib_name(contrib: ReleaseContrib) -> str: else: return contrib.given_name + def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]: # TODO return None + def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]: d = dict() for abst in release.abstracts: if not abst.lang in d: - d[abst.lang] = ScholarAbstract(lang_code=abst.lang, body=scrub_text(abst.content)) + d[abst.lang] = ScholarAbstract( + lang_code=abst.lang, body=scrub_text(abst.content) + ) return list(d.values()) + def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: if release.container: publisher = release.publisher container_name = release.container.name - container_original_name = release.container.extra and release.container.extra.get('original_name') + container_original_name = ( + release.container.extra and release.container.extra.get("original_name") + ) container_ident = release.container.ident container_type = release.container.container_type container_issnl = release.container.issnl - issns = [container_issnl,] - if release.extra.get('issne'): - issns.append(release.extra['issne']) - if release.extra.get('issnp'): - issns.append(release.extra['issnp']) + issns = [ + container_issnl, + ] + if release.extra.get("issne"): + issns.append(release.extra["issne"]) + if release.extra.get("issnp"): + issns.append(release.extra["issnp"]) issns = list(set(issns)) else: - publisher = release.extra.get('publisher') - container_name = release.extra.get('container_name') + publisher = release.extra.get("publisher") + container_name = release.extra.get("container_name") container_original_name = None container_ident = None container_type = None @@ -269,7 +290,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: first_page: Optional[str] = None if release.pages: - first_page = release.pages.split('-')[0] + first_page = release.pages.split("-")[0] first_page_int: Optional[int] = None if first_page and first_page.isdigit(): first_page_int = int(first_page) @@ -285,7 +306,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: release_stage=release.release_stage, withdrawn_status=release.withdrawn_status, lang_code=release.language, - country_code=release.extra and release.extra.get('country'), + country_code=release.extra and release.extra.get("country"), volume=release.volume, volume_int=None, issue=release.issue, @@ -294,7 +315,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: first_page=first_page, first_page_int=None, number=release.number, - doi=release.ext_ids.doi, doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi), doi_registrar=release_doi_registrar(release), 
@@ -305,7 +325,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: arxiv_id=release.ext_ids.arxiv, jstor_id=release.ext_ids.jstor, mag_id=release.ext_ids.mag, - license_slug=release.license_slug, publisher=publisher, container_name=container_name, @@ -314,14 +333,21 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: container_type=container_type, container_issnl=container_issnl, issns=issns, - # TODO; these filters sort of meh. refactor to be above? - contrib_names=list(filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs])), - contrib_count = len([c for c in release.contribs if c.index]), - affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])), + contrib_names=list( + filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs]) + ), + contrib_count=len([c for c in release.contribs if c.index]), + affiliations=list( + filter( + lambda x: bool(x), + [contrib_affiliation(c) for c in release.contribs if c.index], + ) + ), ) return ret + def es_release_from_release(release: ReleaseEntity) -> ScholarRelease: if release.container: @@ -330,7 +356,7 @@ def es_release_from_release(release: ReleaseEntity) -> ScholarRelease: container_issnl = release.container.issnl container_type = release.container.container_type else: - container_name = release.extra.get('container_name') + container_name = release.extra.get("container_name") container_ident = None container_issnl = None container_type = None diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index d29e03b..5a61f53 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -1,4 +1,3 @@ - """ Helpers to make elasticsearch queries. """ @@ -17,6 +16,7 @@ from typing import List, Dict, Tuple, Optional, Any, Sequence # i18n note: the use of gettext below doesn't actually do the translation here, # it just ensures that the strings are caught by babel for translation later + class FulltextQuery(BaseModel): q: Optional[str] = None limit: Optional[int] = None @@ -76,31 +76,42 @@ class FulltextHits(BaseModel): offset: int limit: int deep_page_limit: int - query_time_ms: int + query_time_ms: int results: List[Any] -def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits: +def do_fulltext_search( + query: FulltextQuery, deep_page_limit: int = 2000 +) -> FulltextHits: es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND) search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX) # Convert raw DOIs to DOI queries - if query.q and len(query.q.split()) == 1 and query.q.startswith("10.") and query.q.count("/") >= 1: + if ( + query.q + and len(query.q.split()) == 1 + and query.q.startswith("10.") + and query.q.count("/") >= 1 + ): search = search.filter("terms", doi=query.q) query.q = "*" # type filters if query.filter_type == "papers": - search = search.filter("terms", type=[ "article-journal", "paper-conference", "chapter", ]) + search = search.filter( + "terms", type=["article-journal", "paper-conference", "chapter",] + ) elif query.filter_type == "reports": - search = search.filter("terms", type=[ "report", "standard", ]) + search = search.filter("terms", type=["report", "standard",]) elif query.filter_type == "datasets": - search = search.filter("terms", type=[ "dataset", "software", ]) + search = search.filter("terms", type=["dataset", "software",]) elif query.filter_type == "everything" or query.filter_type == None: pass else: - raise 
ValueError(f"Unknown 'filter_type' parameter value: '{query.filter_type}'") + raise ValueError( + f"Unknown 'filter_type' parameter value: '{query.filter_type}'" + ) # time filters if query.filter_time == "past_week": @@ -111,7 +122,9 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful # the later to catch papers which don't have release_date defined year_ago_date = str(datetime.date.today() - datetime.timedelta(days=365)) this_year = datetime.date.today().year - search = search.filter(Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year)) + search = search.filter( + Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year) + ) elif query.filter_time == "since_2000": search = search.filter("range", year=dict(gte=2000)) elif query.filter_time == "before_1925": @@ -119,7 +132,9 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful elif query.filter_time == "all_time" or query.filter_time == None: pass else: - raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'") + raise ValueError( + f"Unknown 'filter_time' parameter value: '{query.filter_time}'" + ) # availability filters if query.filter_availability == "oa": @@ -129,13 +144,15 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful elif query.filter_availability == "fulltext" or query.filter_availability == None: search = search.filter("terms", access_type=["wayback", "ia_file", "ia_sim"]) else: - raise ValueError(f"Unknown 'filter_availability' parameter value: '{query.filter_availability}'") + raise ValueError( + f"Unknown 'filter_availability' parameter value: '{query.filter_availability}'" + ) # we combined several queries to improve scoring. # this query use the fancy built-in query string parser basic_fulltext = Q( - 'query_string', + "query_string", query=query.q, default_operator="AND", analyze_wildcard=True, @@ -150,12 +167,9 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful "everything", ], ) - has_fulltext = Q( - 'terms', - access_type=["ia_sim", "ia_file", "wayback"], - ) + has_fulltext = Q("terms", access_type=["ia_sim", "ia_file", "wayback"],) poor_metadata = Q( - 'bool', + "bool", should=[ # if these fields aren't set, metadata is poor. The more that do # not exist, the stronger the signal. @@ -168,11 +182,7 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful search = search.query( "boosting", - positive=Q( - "bool", - must=basic_fulltext, - should=[has_fulltext], - ), + positive=Q("bool", must=basic_fulltext, should=[has_fulltext],), negative=poor_metadata, negative_boost=0.5, ) @@ -201,15 +211,15 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful # Avoid deep paging problem. 
offset = deep_page_limit - search = search[offset:offset+limit] + search = search[offset : offset + limit] try: resp = search.execute() except elasticsearch.exceptions.RequestError as e: # this is a "user" error print("elasticsearch 400: " + str(e.info), file=sys.stderr) - if e.info.get('error', {}).get('root_cause', {}): - raise ValueError(str(e.info['error']['root_cause'][0].get('reason'))) + if e.info.get("error", {}).get("root_cause", {}): + raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) else: raise ValueError(str(e.info)) except elasticsearch.exceptions.TransportError as e: @@ -221,12 +231,12 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful results = [] for h in resp: r = h._d_ - #print(json.dumps(h.meta._d_, indent=2)) - r['_highlights'] = [] - if 'highlight' in dir(h.meta): + # print(json.dumps(h.meta._d_, indent=2)) + r["_highlights"] = [] + if "highlight" in dir(h.meta): highlights = h.meta.highlight._d_ for k in highlights: - r['_highlights'] += highlights[k] + r["_highlights"] += highlights[k] results.append(r) for h in results: @@ -235,7 +245,7 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful # "Crimes against Unicode"; production workaround for key in h: if type(h[key]) is str: - h[key] = h[key].encode('utf8', 'ignore').decode('utf8') + h[key] = h[key].encode("utf8", "ignore").decode("utf8") return FulltextHits( count_returned=len(results), diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py index 6b52535..b84ac47 100644 --- a/fatcat_scholar/sim_pipeline.py +++ b/fatcat_scholar/sim_pipeline.py @@ -1,4 +1,3 @@ - import os import io import sys @@ -12,9 +11,17 @@ import internetarchive from fatcat_scholar.api_entities import * from fatcat_scholar.djvu import djvu_extract_leaf_texts -from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient +from fatcat_scholar.sandcrawler import ( + SandcrawlerPostgrestClient, + SandcrawlerMinioClient, +) from fatcat_scholar.issue_db import IssueDB, SimIssueRow -from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle +from fatcat_scholar.schema import ( + es_biblio_from_release, + es_release_from_release, + DocType, + IntermediateBundle, +) def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]: @@ -23,26 +30,27 @@ def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]: collection, and simplifies it by removing fields. Motivation is to make intermediate bundle files smaller. 
""" - full.pop('files') - if 'ulrichs' in full and full['ulrichs']: - full['ulrichs'][0].pop('reviews_mfl') - full['ulrichs'][0].pop('editorial_description') + full.pop("files") + if "ulrichs" in full and full["ulrichs"]: + full["ulrichs"][0].pop("reviews_mfl") + full["ulrichs"][0].pop("editorial_description") # these are interesting, but just too long - full['ulrichs'][0].pop('online_availability_full_text') - full['ulrichs'][0].pop('abstracting_indexing') - full['ulrichs'][0].pop('publisher_and_ordering_details') + full["ulrichs"][0].pop("online_availability_full_text") + full["ulrichs"][0].pop("abstracting_indexing") + full["ulrichs"][0].pop("publisher_and_ordering_details") return full + def truncate_issue_meta(full: Dict[str, Any]) -> Dict[str, Any]: """ Same as truncate_pub_meta() but for issue item metadata """ - full.pop('files') + full.pop("files") return full -class SimPipeline(): +class SimPipeline: def __init__(self, issue_db: IssueDB): self.issue_db: IssueDB = issue_db self.ia_client = internetarchive.get_session() @@ -60,44 +68,50 @@ class SimPipeline(): issue_item_metadata """ # fetch full metadata from API - issue_meta = self.ia_client.get_metadata(issue_db_row['issue_item']) - pub_meta = self.ia_client.get_metadata(issue_db_row['pub_collection']) + issue_meta = self.ia_client.get_metadata(issue_db_row["issue_item"]) + pub_meta = self.ia_client.get_metadata(issue_db_row["pub_collection"]) leaf_index = dict() leaf_list = [] - if not 'page_numbers' in issue_meta: + if not "page_numbers" in issue_meta: # TODO: warn return None - for entry in issue_meta['page_numbers'].get('pages', []): - page_num = entry['pageNumber'] - leaf_index[entry['leafNum']] = page_num + for entry in issue_meta["page_numbers"].get("pages", []): + page_num = entry["pageNumber"] + leaf_index[entry["leafNum"]] = page_num if not (page_num and page_num.isdigit()): continue page_num = int(page_num) - leaf_list.append(entry['leafNum']) + leaf_list.append(entry["leafNum"]) if not leaf_list: return None page_texts: List[Dict[str, Any]] = [] - issue_item = self.ia_client.get_item(issue_db_row['issue_item']) - issue_item_djvu = issue_item.get_file(issue_db_row['issue_item'] + "_djvu.xml") + issue_item = self.ia_client.get_item(issue_db_row["issue_item"]) + issue_item_djvu = issue_item.get_file(issue_db_row["issue_item"] + "_djvu.xml") # override 'close()' method so we can still read out contents djvu_bytes = io.BytesIO() - djvu_bytes.close = lambda: None # type: ignore + djvu_bytes.close = lambda: None # type: ignore assert issue_item_djvu.download(fileobj=djvu_bytes) == True djvu_bytes.seek(0) djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8")) - del(djvu_bytes) + del djvu_bytes leaf_dict = djvu_extract_leaf_texts(djvu_xml) for leaf_num, raw_text in leaf_dict.items(): - page_texts.append(dict(page_num=leaf_index.get(leaf_num), leaf_num=leaf_num, raw_text=raw_text)) + page_texts.append( + dict( + page_num=leaf_index.get(leaf_num), + leaf_num=leaf_num, + raw_text=raw_text, + ) + ) return dict( - issue_item=issue_db_row['issue_item'], + issue_item=issue_db_row["issue_item"], pages=None, page_texts=page_texts, release_ident=None, @@ -109,10 +123,14 @@ class SimPipeline(): count = 0 self.issue_db.db.row_factory = sqlite3.Row cur = self.issue_db.db.cursor() - for row in cur.execute('SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3'): + for row in cur.execute( + "SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = 
sim_pub.sim_pubid WHERE sim_issue.release_count < 3" + ): # filter out "contents" and "index" items # TODO: more filters; also redundant with IssueDB code? - if row['issue_item'].endswith('_contents') or row['issue_item'].endswith('_index'): + if row["issue_item"].endswith("_contents") or row["issue_item"].endswith( + "_index" + ): continue try: full_issue = self.fetch_sim_issue(row) @@ -124,7 +142,7 @@ class SimPipeline(): continue if not full_issue: continue - for leaf in full_issue['page_texts']: + for leaf in full_issue["page_texts"]: bundle = IntermediateBundle( doc_type=DocType.sim_page, releases=[], @@ -132,13 +150,13 @@ class SimPipeline(): grobid_fulltext=None, pdftotext_fulltext=None, sim_fulltext=dict( - issue_item=full_issue['issue_item'], - pages=str(leaf['page_num']), + issue_item=full_issue["issue_item"], + pages=str(leaf["page_num"]), page_texts=[leaf], release_ident=None, - pub_item_metadata=full_issue['pub_item_metadata'], - issue_item_metadata=full_issue['issue_item_metadata'], - ) + pub_item_metadata=full_issue["pub_item_metadata"], + issue_item_metadata=full_issue["issue_item_metadata"], + ), ) print(bundle.json()) count += 1 @@ -147,6 +165,7 @@ class SimPipeline(): if limit is not None and count >= limit: break + def main(): """ Run this command like: @@ -155,20 +174,20 @@ def main(): """ parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) subparsers = parser.add_subparsers() - parser.add_argument("--issue-db-file", + parser.add_argument( + "--issue-db-file", help="sqlite3 database file to open", - default='data/issue_db.sqlite', - type=str) + default="data/issue_db.sqlite", + type=str, + ) - sub = subparsers.add_parser('run_issue_db', - help="iterates through entire IssueDB") - sub.set_defaults(func='run_issue_db') - sub.add_argument("--limit", - help="maximum number of pages to index", - type=int) + sub = subparsers.add_parser("run_issue_db", help="iterates through entire IssueDB") + sub.set_defaults(func="run_issue_db") + sub.add_argument("--limit", help="maximum number of pages to index", type=int) args = parser.parse_args() if not args.__dict__.get("func"): @@ -177,11 +196,12 @@ def main(): sp = SimPipeline(issue_db=IssueDB(args.issue_db_file)) - if args.func == 'run_issue_db': + if args.func == "run_issue_db": sp.run_issue_db(limit=args.limit) else: func = getattr(sp, args.func) func() -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 953ebff..b5a0223 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -1,4 +1,3 @@ - import os import io import sys @@ -10,83 +9,89 @@ import internetarchive from fatcat_scholar.api_entities import * from fatcat_scholar.djvu import djvu_extract_leaf_texts -from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient +from fatcat_scholar.sandcrawler import ( + SandcrawlerPostgrestClient, + SandcrawlerMinioClient, +) from fatcat_scholar.issue_db import IssueDB, SimIssueRow from fatcat_scholar.schema import * from fatcat_scholar.grobid2json import teixml2json def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]: - if not sim['page_texts']: + if not sim["page_texts"]: return None - first_page = sim['page_texts'][0]['page_num'] - issue_item = sim['issue_item'] + first_page = sim["page_texts"][0]["page_num"] + issue_item = sim["issue_item"] return ScholarFulltext( - 
lang_code=None, # TODO: pub/issue metadata? or langdetect? - body="\n".join([p['raw_text'] for p in sim['page_texts']]), - #acknowledgement=None, - #annex=None, - release_ident=sim.get('release_ident'), - #file_ident=None, - #file_sha1=None, - #file_mimetype=None, + lang_code=None, # TODO: pub/issue metadata? or langdetect? + body="\n".join([p["raw_text"] for p in sim["page_texts"]]), + # acknowledgement=None, + # annex=None, + release_ident=sim.get("release_ident"), + # file_ident=None, + # file_sha1=None, + # file_mimetype=None, thumbnail_url=f"https://archive.org/serve/{issue_item}/__ia_thumb.jpg", access_url=f"https://archive.org/details/{issue_item}/page/{first_page}", access_type=AccessType.ia_sim, ) + def es_sim_from_sim(sim: Dict[str, Any]) -> ScholarSim: first_page = None - if sim['page_texts']: - first_page = sim['page_texts'][0]['page_num'] + if sim["page_texts"]: + first_page = sim["page_texts"][0]["page_num"] return ScholarSim( - issue_item=sim['issue_item'], - pub_collection=sim['pub_item_metadata']['metadata']['identifier'], - sim_pubid=sim['issue_item_metadata']['metadata']['sim_pubid'], + issue_item=sim["issue_item"], + pub_collection=sim["pub_item_metadata"]["metadata"]["identifier"], + sim_pubid=sim["issue_item_metadata"]["metadata"]["sim_pubid"], first_page=first_page, ) + SIM_RELEASE_TYPE_MAP = { - 'Scholarly Journals': 'article-journal', + "Scholarly Journals": "article-journal", # TODO: } SIM_LANG_MAP = { - 'English': 'en', + "English": "en", # TODO: } SIM_COUNTRY_MAP = { - 'Netherlands': 'nl', + "Netherlands": "nl", # TODO: } + def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: - issue_meta = sim['issue_item_metadata']['metadata'] - pub_meta = sim['pub_item_metadata']['metadata'] + issue_meta = sim["issue_item_metadata"]["metadata"] + pub_meta = sim["pub_item_metadata"]["metadata"] first_page = None - if sim['page_texts']: - first_page = sim['page_texts'][0]['page_num'] - container_name = sim['pub_item_metadata']['metadata']['title'] + if sim["page_texts"]: + first_page = sim["page_texts"][0]["page_num"] + container_name = sim["pub_item_metadata"]["metadata"]["title"] last_word = container_name.split()[-1] - if len(last_word) == 9 and last_word[4] == '-': + if len(last_word) == 9 and last_word[4] == "-": container_name = container_name[:-10] issns = [] - raw_issn = issue_meta.get('issn') + raw_issn = issue_meta.get("issn") if raw_issn and len(raw_issn) == 9: issns.append(raw_issn) - volume = issue_meta.get('volume') + volume = issue_meta.get("volume") volume_int = None if volume and volume.isdigit(): volume_int = int(volume) - issue = issue_meta.get('issue') + issue = issue_meta.get("issue") issue_int = None if issue and issue.isdigit(): issue_int = int(issue) - date = issue_meta.get('date') + date = issue_meta.get("date") release_year = None if date and len(date) > 4 and date[:4].isdigit(): release_year = int(date[:4]) @@ -96,52 +101,52 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: release_date = date return ScholarBiblio( - #release_ident=release.ident, + # release_ident=release.ident, title=None, - #subtitle=None, - #original_title=release.original_title, + # subtitle=None, + # original_title=release.original_title, release_date=release_date, release_year=release_year, - release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get('pub_type')), - release_stage="published", # as a default - #withdrawn_status=release.withdrawn_status, - lang_code=SIM_LANG_MAP.get(pub_meta.get('language')), - 
country_code=SIM_COUNTRY_MAP.get(pub_meta.get('country')), + release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get("pub_type")), + release_stage="published", # as a default + # withdrawn_status=release.withdrawn_status, + lang_code=SIM_LANG_MAP.get(pub_meta.get("language")), + country_code=SIM_COUNTRY_MAP.get(pub_meta.get("country")), volume=volume, volume_int=volume_int, issue=issue, issue_int=issue_int, - pages=sim.get('pages'), + pages=sim.get("pages"), first_page=first_page, first_page_int=None, - #number=None, - + # number=None, # no external identifiers - - #license_slug=release.license_slug, - publisher=issue_meta.get('publisher'), + # license_slug=release.license_slug, + publisher=issue_meta.get("publisher"), container_name=container_name, - container_original_name=None, # TODO pass-through - container_ident=None, # TODO: pass-through - container_type=None, # TODO - container_issnl=None, # TODO: pass-through + container_original_name=None, # TODO pass-through + container_ident=None, # TODO: pass-through + container_type=None, # TODO + container_issnl=None, # TODO: pass-through issns=issns, - # no contrib/affiliation info contrib_names=[], affiliations=[], ) -def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity) -> ScholarFulltext: + +def _add_file_release_meta( + fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity +) -> ScholarFulltext: best_url = None best_url_type = None for url in fe.urls: best_url = url.url best_url_type = AccessType.web - if '//archive.org/' in url.url: + if "//archive.org/" in url.url: best_url_type = AccessType.ia_file break - elif '//web.archive.org/' in url.url: + elif "//web.archive.org/" in url.url: best_url_type = AccessType.wayback break if url.rel == "repository": @@ -157,30 +162,36 @@ def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: Fil return fulltext -def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]: +def es_fulltext_from_grobid( + tei_xml: str, re: ReleaseEntity, fe: FileEntity +) -> Optional[ScholarFulltext]: obj = teixml2json(tei_xml) - if not obj.get('body'): + if not obj.get("body"): return None ret = ScholarFulltext( - lang_code=obj.get('lang'), - body=obj.get('body'), - acknowledgement=obj.get('acknowledgement'), - annex=obj.get('annex'), - thumbnail_url=None, # TODO: sandcrawler thumbnails + lang_code=obj.get("lang"), + body=obj.get("body"), + acknowledgement=obj.get("acknowledgement"), + annex=obj.get("annex"), + thumbnail_url=None, # TODO: sandcrawler thumbnails ) return _add_file_release_meta(ret, re, fe) -def es_fulltext_from_pdftotext(pdftotext: Any, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]: + +def es_fulltext_from_pdftotext( + pdftotext: Any, re: ReleaseEntity, fe: FileEntity +) -> Optional[ScholarFulltext]: ret = ScholarFulltext( lang_code=re.language, - body=pdftotext['raw_text'], + body=pdftotext["raw_text"], acknowledgement=None, annex=None, - thumbnail_url=None, # TODO: sandcrawler thumbnails + thumbnail_url=None, # TODO: sandcrawler thumbnails ) return _add_file_release_meta(ret, re, fe) + def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: tags: List[str] = [] @@ -203,7 +214,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: work_ident = heavy.releases[0].work_id key = f"work_{work_ident}" assert heavy.biblio_release_ident - primary_release = [r for r in heavy.releases if r.ident == heavy.biblio_release_ident][0] + primary_release = [ + 
r for r in heavy.releases if r.ident == heavy.biblio_release_ident + ][0] biblio = es_biblio_from_release(primary_release) # TODO: abstracts from releases also; abstracts_dict; abstracts from GROBID parse @@ -212,19 +225,44 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: raise NotImplementedError(f"doc_type: {heavy.doc_type}") if heavy.grobid_fulltext: - fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0] - fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0] - fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file) + fulltext_release = [ + r + for r in heavy.releases + if r.ident == heavy.grobid_fulltext["release_ident"] + ][0] + fulltext_file = [ + f + for f in fulltext_release.files + if f.ident == heavy.grobid_fulltext["file_ident"] + ][0] + fulltext = es_fulltext_from_grobid( + heavy.grobid_fulltext["tei_xml"], fulltext_release, fulltext_file + ) # hack to pull through thumbnail from local pdftotext - if fulltext and fulltext.file_sha1 and not fulltext.thumbnail_url and heavy.pdftotext_fulltext: + if ( + fulltext + and fulltext.file_sha1 + and not fulltext.thumbnail_url + and heavy.pdftotext_fulltext + ): # https://covid19.fatcat.wiki/fulltext_web/thumbnail/c9/c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521.png fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png" if not fulltext and heavy.pdftotext_fulltext: - fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0] - fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0] - fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file) + fulltext_release = [ + r + for r in heavy.releases + if r.ident == heavy.pdftotext_fulltext["release_ident"] + ][0] + fulltext_file = [ + f + for f in fulltext_release.files + if f.ident == heavy.pdftotext_fulltext["file_ident"] + ][0] + fulltext = es_fulltext_from_pdftotext( + heavy.pdftotext_fulltext, fulltext_release, fulltext_file + ) # TODO: additional access list access_dict = dict() @@ -246,41 +284,41 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: # tags if biblio.license_slug and biblio.license_slug.lower().startswith("cc-"): - tags.append('oa') + tags.append("oa") if primary_release and primary_release.container: container = primary_release.container if container.extra: - if container.extra.get('doaj'): - tags.append('doaj') - tags.append('oa') - if container.extra.get('road'): - tags.append('road') - tags.append('oa') - if container.extra.get('szczepanski'): - tags.append('szczepanski') - tags.append('oa') - if container.extra.get('ia', {}).get('longtail_oa'): - tags.append('longtail') - tags.append('oa') - if container.extra.get('sherpa_romeo', {}).get('color') == 'white': - tags.append('oa') - if container.extra.get('default_license', '').lower().startswith('cc-'): - tags.append('oa') - if container.extra.get('platform'): + if container.extra.get("doaj"): + tags.append("doaj") + tags.append("oa") + if container.extra.get("road"): + tags.append("road") + tags.append("oa") + if container.extra.get("szczepanski"): + tags.append("szczepanski") + tags.append("oa") + if container.extra.get("ia", {}).get("longtail_oa"): + tags.append("longtail") + tags.append("oa") + if 
container.extra.get("sherpa_romeo", {}).get("color") == "white": + tags.append("oa") + if container.extra.get("default_license", "").lower().startswith("cc-"): + tags.append("oa") + if container.extra.get("platform"): # scielo, ojs, wordpress, etc - tags.append(container.extra['platform'].lower()) - if biblio.doi_prefix == '10.2307': - tags.append('jstor') + tags.append(container.extra["platform"].lower()) + if biblio.doi_prefix == "10.2307": + tags.append("jstor") # biorxiv/medrxiv hacks if not biblio.container_name and biblio.release_stage != "published": for _, acc in access_dict.items(): if "://www.medrxiv.org/" in acc.access_url: - biblio.container_name = 'medRxiv' + biblio.container_name = "medRxiv" if biblio.release_stage == None: biblio.release_stage = "submitted" elif "://www.biorxiv.org/" in acc.access_url: - biblio.container_name = 'bioRxiv' + biblio.container_name = "bioRxiv" if biblio.release_stage == None: biblio.release_stage = "submitted" tags = list(set(tags)) @@ -291,7 +329,6 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: doc_index_ts=datetime.datetime.utcnow(), work_ident=work_ident, tags=tags, - biblio=biblio, fulltext=fulltext, ia_sim=ia_sim, @@ -300,23 +337,28 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: access=list(access_dict.values()), ) + def run_transform(infile): for line in infile: obj = json.loads(line) heavy = IntermediateBundle( - doc_type=DocType(obj['doc_type']), - releases=[entity_from_json(json.dumps(re), ReleaseEntity) for re in obj['releases']], - biblio_release_ident=obj.get('biblio_release_ident'), - grobid_fulltext=obj.get('grobid_fulltext'), - pdftotext_fulltext=obj.get('pdftotext_fulltext'), - sim_fulltext=obj.get('sim_fulltext'), + doc_type=DocType(obj["doc_type"]), + releases=[ + entity_from_json(json.dumps(re), ReleaseEntity) + for re in obj["releases"] + ], + biblio_release_ident=obj.get("biblio_release_ident"), + grobid_fulltext=obj.get("grobid_fulltext"), + pdftotext_fulltext=obj.get("pdftotext_fulltext"), + sim_fulltext=obj.get("sim_fulltext"), ) es_doc = transform_heavy(heavy) if not es_doc: continue print(es_doc.json()) + def main(): """ Run this command like: @@ -325,25 +367,32 @@ def main(): """ parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) subparsers = parser.add_subparsers() - sub = subparsers.add_parser('run_transform', - help="iterates through 'heavy' intermediate") - sub.set_defaults(func='run_transform') - sub.add_argument("json_file", + sub = subparsers.add_parser( + "run_transform", help="iterates through 'heavy' intermediate" + ) + sub.set_defaults(func="run_transform") + sub.add_argument( + "json_file", help="intermediate globs as JSON-lines", - nargs='?', default=sys.stdin, type=argparse.FileType('r')) + nargs="?", + default=sys.stdin, + type=argparse.FileType("r"), + ) args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do! 
(try --help)") sys.exit(-1) - if args.func == 'run_transform': + if args.func == "run_transform": run_transform(infile=args.json_file) else: raise NotImplementedError(args.func) -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index 2fd8b24..6c8a2e9 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -19,7 +19,9 @@ from fatcat_scholar.search import do_fulltext_search, FulltextQuery, FulltextHit print(f"dynaconf settings: {settings.as_dict()}", file=sys.stderr) I18N_LANG_TRANSLATIONS = ["de", "zh"] -I18N_LANG_OPTIONS = I18N_LANG_TRANSLATIONS + [settings.I18N_LANG_DEFAULT,] +I18N_LANG_OPTIONS = I18N_LANG_TRANSLATIONS + [ + settings.I18N_LANG_DEFAULT, +] class LangPrefix: @@ -32,14 +34,15 @@ class LangPrefix: """ def __init__(self, request: Request): - self.prefix : str = "" - self.code : str = settings.I18N_LANG_DEFAULT + self.prefix: str = "" + self.code: str = settings.I18N_LANG_DEFAULT for lang_option in I18N_LANG_OPTIONS: if request.url.path.startswith(f"/{lang_option}/"): self.prefix = f"/{lang_option}" self.code = lang_option break + class ContentNegotiation: """ Choses a mimetype to return based on Accept header. @@ -49,31 +52,40 @@ class ContentNegotiation: def __init__(self, request: Request): self.mimetype = "text/html" - if request.headers.get('accept', '').startswith('application/json'): + if request.headers.get("accept", "").startswith("application/json"): self.mimetype = "application/json" + api = APIRouter() + @api.get("/", operation_id="get_home") async def home(): return {"endpoints": {"/": "this", "/search": "fulltext search"}} + @api.get("/search", operation_id="get_search") async def search(query: FulltextQuery = Depends(FulltextQuery)): return {"message": "search results would go here, I guess"} + web = APIRouter() + def locale_gettext(translations): def gt(s): return translations.ugettext(s) + return gt + def locale_ngettext(translations): def ngt(s, n): return translations.ungettext(s) + return ngt + def load_i18n_templates(): """ This is a hack to work around lack of per-request translation @@ -90,53 +102,68 @@ def load_i18n_templates(): d = dict() for lang_opt in I18N_LANG_OPTIONS: translations = babel.support.Translations.load( - dirname="fatcat_scholar/translations", - locales=[lang_opt], + dirname="fatcat_scholar/translations", locales=[lang_opt], ) templates = Jinja2Templates( - directory="fatcat_scholar/templates", - extensions=["jinja2.ext.i18n"], + directory="fatcat_scholar/templates", extensions=["jinja2.ext.i18n"], ) templates.env.install_gettext_translations(translations, newstyle=True) templates.env.install_gettext_callables( - locale_gettext(translations), - locale_ngettext(translations), - newstyle=True, + locale_gettext(translations), locale_ngettext(translations), newstyle=True, ) # remove a lot of whitespace in HTML output with these configs templates.env.trim_blocks = True templates.env.istrip_blocks = True # pass-through application settings to be available in templates - templates.env.globals['settings'] = settings + templates.env.globals["settings"] = settings d[lang_opt] = templates return d + i18n_templates = load_i18n_templates() @web.get("/", include_in_schema=False) -async def web_home(request: Request, lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)): +async def web_home( + request: Request, + lang: LangPrefix = Depends(LangPrefix), + content: ContentNegotiation = Depends(ContentNegotiation), +): if 
content.mimetype == "application/json": return await home() - return i18n_templates[lang.code].TemplateResponse("home.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}) + return i18n_templates[lang.code].TemplateResponse( + "home.html", + {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}, + ) @web.get("/about", include_in_schema=False) async def web_about(request: Request, lang: LangPrefix = Depends(LangPrefix)): - return i18n_templates[lang.code].TemplateResponse("about.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}) + return i18n_templates[lang.code].TemplateResponse( + "about.html", + {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}, + ) @web.get("/help", include_in_schema=False) async def web_help(request: Request, lang: LangPrefix = Depends(LangPrefix)): - return i18n_templates[lang.code].TemplateResponse("help.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}) + return i18n_templates[lang.code].TemplateResponse( + "help.html", + {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}, + ) @web.get("/search", include_in_schema=False) -async def web_search(request: Request, query: FulltextQuery = Depends(FulltextQuery), lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)): +async def web_search( + request: Request, + query: FulltextQuery = Depends(FulltextQuery), + lang: LangPrefix = Depends(LangPrefix), + content: ContentNegotiation = Depends(ContentNegotiation), +): if content.mimetype == "application/json": return await search(query) - hits : Optional[FulltextHits] = None + hits: Optional[FulltextHits] = None search_error: Optional[dict] = None status_code: int = 200 if query.q is not None: @@ -182,4 +209,3 @@ for lang_option in I18N_LANG_OPTIONS: app.include_router(api) app.mount("/static", StaticFiles(directory="fatcat_scholar/static"), name="static") - diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index 46e40e1..af558a3 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -1,4 +1,3 @@ - import os import io import sys @@ -12,9 +11,17 @@ import internetarchive from fatcat_scholar.api_entities import * from fatcat_scholar.djvu import djvu_extract_leaf_texts -from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient +from fatcat_scholar.sandcrawler import ( + SandcrawlerPostgrestClient, + SandcrawlerMinioClient, +) from fatcat_scholar.issue_db import IssueDB, SimIssueRow, SimPubRow -from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle +from fatcat_scholar.schema import ( + es_biblio_from_release, + es_release_from_release, + DocType, + IntermediateBundle, +) from fatcat_scholar.sim_pipeline import truncate_pub_meta, truncate_issue_meta @@ -25,17 +32,18 @@ def parse_pages(raw: str) -> Tuple[Optional[int], Optional[int]]: first = int(first_raw) if not "-" in raw: return (first, first) - last_raw = raw.split('-')[-1] + last_raw = raw.split("-")[-1] if not last_raw.isdigit(): return (first, first) last = int(last_raw) if last < first: - last_munge = first_raw[0:(len(first_raw)-len(last_raw))] + last_raw + last_munge = first_raw[0 : (len(first_raw) - len(last_raw))] + last_raw last = int(last_munge) if last < first: return (first, first) return (first, last) + def test_parse_pages(): assert parse_pages("479-89") == (479, 489) assert parse_pages("466-7") 
== (466, 467) @@ -52,24 +60,33 @@ def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]: Returns a list of release idents in preference order (best first) to try and find fulltext for. """ - releases_sorted = sorted(releases, reverse=True, key=lambda r: ( - r.release_stage=="updated", - r.release_stage=="published", - r.volume is not None, - r.container_id is not None, - r.ext_ids.pmid is not None, - r.release_stage=="submitted", - r.release_type is not None, - r.release_year, - r.release_date, - r.version, - )) + releases_sorted = sorted( + releases, + reverse=True, + key=lambda r: ( + r.release_stage == "updated", + r.release_stage == "published", + r.volume is not None, + r.container_id is not None, + r.ext_ids.pmid is not None, + r.release_stage == "submitted", + r.release_type is not None, + r.release_year, + r.release_date, + r.version, + ), + ) return [r.ident for r in releases_sorted] -class WorkPipeline(): - - def __init__(self, issue_db: IssueDB, sandcrawler_db_client: SandcrawlerPostgrestClient, sandcrawler_s3_client: SandcrawlerMinioClient, fulltext_cache_dir=None): +class WorkPipeline: + def __init__( + self, + issue_db: IssueDB, + sandcrawler_db_client: SandcrawlerPostgrestClient, + sandcrawler_s3_client: SandcrawlerMinioClient, + fulltext_cache_dir=None, + ): self.issue_db: IssueDB = issue_db self.ia_client = internetarchive.get_session() self.sandcrawler_db_client = sandcrawler_db_client @@ -87,9 +104,9 @@ class WorkPipeline(): if not fe.urls: return None grobid_meta = self.sandcrawler_db_client.get_grobid(fe.sha1) - if not grobid_meta or grobid_meta['status'] != 'success': + if not grobid_meta or grobid_meta["status"] != "success": return None - #print(grobid_meta) + # print(grobid_meta) try: grobid_xml = self.sandcrawler_s3_client.get_blob( folder="grobid", @@ -98,13 +115,11 @@ class WorkPipeline(): prefix="", bucket="sandcrawler", ) - #print(grobid_xml) + # print(grobid_xml) except minio.error.NoSuchKey: return None return dict( - tei_xml=grobid_xml, - release_ident=release_ident, - file_ident=fe.ident, + tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident, ) def fetch_file_pdftotext(self, fe: FileEntity, release_ident: str) -> Optional[Any]: @@ -115,14 +130,14 @@ class WorkPipeline(): """ # HACK: look for local pdftotext output if self.fulltext_cache_dir: - local_txt_path = f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt" + local_txt_path = ( + f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt" + ) try: - with open(local_txt_path, 'r') as txt_file: + with open(local_txt_path, "r") as txt_file: raw_text = txt_file.read() return dict( - raw_text=raw_text, - release_ident=release_ident, - file_ident=fe.ident, + raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident, ) except FileNotFoundError: pass @@ -144,9 +159,17 @@ class WorkPipeline(): if not sim_pubid: return None - return self.issue_db.lookup_issue(sim_pubid=sim_pubid, volume=release.volume, issue=release.issue) + return self.issue_db.lookup_issue( + sim_pubid=sim_pubid, volume=release.volume, issue=release.issue + ) - def fetch_sim(self, issue_db_row: SimIssueRow, issue_db_pub_row: SimPubRow, pages: str, release_ident: str) -> Optional[Any]: + def fetch_sim( + self, + issue_db_row: SimIssueRow, + issue_db_pub_row: SimPubRow, + pages: str, + release_ident: str, + ) -> Optional[Any]: """ issue_item pages: str @@ -169,17 +192,17 @@ class WorkPipeline(): leaf_index = dict() leaf_list = [] - if not 'page_numbers' in issue_meta: + if not 
"page_numbers" in issue_meta: # TODO: warn return None - for entry in issue_meta['page_numbers'].get('pages', []): - page_num = entry['pageNumber'] - leaf_index[entry['leafNum']] = page_num + for entry in issue_meta["page_numbers"].get("pages", []): + page_num = entry["pageNumber"] + leaf_index[entry["leafNum"]] = page_num if not (page_num and page_num.isdigit()): continue page_num = int(page_num) if page_num >= first_page and page_num <= last_page: - leaf_list.append(entry['leafNum']) + leaf_list.append(entry["leafNum"]) if not leaf_list: return None @@ -190,16 +213,22 @@ class WorkPipeline(): # override 'close()' method so we can still read out contents djvu_bytes = io.BytesIO() - djvu_bytes.close = lambda: None # type: ignore + djvu_bytes.close = lambda: None # type: ignore assert issue_item_djvu.download(fileobj=djvu_bytes) == True djvu_bytes.seek(0) djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8")) - del(djvu_bytes) + del djvu_bytes leaf_dict = djvu_extract_leaf_texts(djvu_xml, only_leaves=leaf_list) for leaf_num, raw_text in leaf_dict.items(): - page_texts.append(dict(page_num=leaf_index.get(leaf_num), leaf_num=leaf_num, raw_text=raw_text)) + page_texts.append( + dict( + page_num=leaf_index.get(leaf_num), + leaf_num=leaf_num, + raw_text=raw_text, + ) + ) return dict( issue_item=issue_db_row.issue_item, @@ -220,7 +249,7 @@ class WorkPipeline(): pref_idents = fulltext_pref_list(releases) release_dict = dict([(r.ident, r) for r in releases]) - #print(f"pref_idents={pref_idents}", file=sys.stderr) + # print(f"pref_idents={pref_idents}", file=sys.stderr) # find best accessible fatcat file grobid_fulltext: Optional[Any] = None @@ -244,12 +273,12 @@ class WorkPipeline(): sim_issue: Optional[Any] = None for ident in pref_idents: release = release_dict[ident] - #print(f"{release.extra}\n{release.pages}", file=sys.stderr) + # print(f"{release.extra}\n{release.pages}", file=sys.stderr) if not release.pages: continue # TODO: in the future, will use release.extra.ia.sim.sim_pubid for lookup sim_issue = self.lookup_sim(release) - #print(f"release_{release.ident}: sim_issue={sim_issue}", file=sys.stderr) + # print(f"release_{release.ident}: sim_issue={sim_issue}", file=sys.stderr) if not sim_issue: continue sim_pub = self.issue_db.lookup_pub(sim_issue.sim_pubid) @@ -257,7 +286,9 @@ class WorkPipeline(): continue # XXX: control flow tweak? 
try: - sim_fulltext = self.fetch_sim(sim_issue, sim_pub, release.pages, release.ident) + sim_fulltext = self.fetch_sim( + sim_issue, sim_pub, release.pages, release.ident + ) except requests.exceptions.ConnectionError as e: print(str(e), file=sys.stderr) continue @@ -300,13 +331,16 @@ class WorkPipeline(): ib = self.process_release_list(batch) print(ib.json()) batch_work_id = None - batch = [release,] + batch = [ + release, + ] batch_work_id = release.work_id if batch: ib = self.process_release_list(batch) print(ib.json()) + def main(): """ Run this command like: @@ -315,31 +349,46 @@ def main(): """ parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) subparsers = parser.add_subparsers() - parser.add_argument("--issue-db-file", + parser.add_argument( + "--issue-db-file", help="sqlite3 database file to open", - default='data/issue_db.sqlite', - type=str) - parser.add_argument("--sandcrawler-db-api", + default="data/issue_db.sqlite", + type=str, + ) + parser.add_argument( + "--sandcrawler-db-api", help="Sandcrawler Postgrest API endpoint", - default='http://aitio.us.archive.org:3030', - type=str) - parser.add_argument("--sandcrawler-s3-api", + default="http://aitio.us.archive.org:3030", + type=str, + ) + parser.add_argument( + "--sandcrawler-s3-api", help="Sandcrawler S3 (minio/seaweedfs) API endpoint", - default='aitio.us.archive.org:9000', - type=str) + default="aitio.us.archive.org:9000", + type=str, + ) - sub = subparsers.add_parser('run_releases', - help="takes expanded release entity JSON, sorted by work_ident") - sub.set_defaults(func='run_releases') - sub.add_argument("json_file", + sub = subparsers.add_parser( + "run_releases", help="takes expanded release entity JSON, sorted by work_ident" + ) + sub.set_defaults(func="run_releases") + sub.add_argument( + "json_file", help="release entities, as JSON-lines", - nargs='?', default=sys.stdin, type=argparse.FileType('r')) - sub.add_argument("--fulltext-cache-dir", + nargs="?", + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub.add_argument( + "--fulltext-cache-dir", help="path of local directory with pdftotext fulltext (and thumbnails)", - default=None, type=str) + default=None, + type=str, + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -348,20 +397,23 @@ def main(): wp = WorkPipeline( issue_db=IssueDB(args.issue_db_file), - sandcrawler_db_client=SandcrawlerPostgrestClient(api_url=args.sandcrawler_db_api), + sandcrawler_db_client=SandcrawlerPostgrestClient( + api_url=args.sandcrawler_db_api + ), sandcrawler_s3_client=SandcrawlerMinioClient( host_url=args.sandcrawler_s3_api, - access_key=os.environ.get('MINIO_ACCESS_KEY'), - secret_key=os.environ.get('MINIO_SECRET_KEY'), + access_key=os.environ.get("MINIO_ACCESS_KEY"), + secret_key=os.environ.get("MINIO_SECRET_KEY"), ), fulltext_cache_dir=args.fulltext_cache_dir, ) - if args.func == 'run_releases': + if args.func == "run_releases": wp.run_releases(args.json_file) else: func = getattr(wp, args.func) func() -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/tests/test_djvu_parse.py b/tests/test_djvu_parse.py index abf5a4c..1f23de2 100644 --- a/tests/test_djvu_parse.py +++ b/tests/test_djvu_parse.py @@ -1,4 +1,3 @@ - import io from fatcat_scholar.djvu import djvu_extract_leaf_texts @@ -6,10 +5,10 @@ from fatcat_scholar.djvu import djvu_extract_leaf_texts def test_djvu_extract_leaf_texts(): # https://archive.org/details/ERIC_ED441501 - 
with open('tests/files/ERIC_ED441501_djvu.xml', 'r') as f:
+    with open("tests/files/ERIC_ED441501_djvu.xml", "r") as f:
         blob = f.read()
-    leaves = djvu_extract_leaf_texts(io.StringIO(blob), [3,6])
+    leaves = djvu_extract_leaf_texts(io.StringIO(blob), [3, 6])
     assert 3 in leaves
     assert 6 in leaves
     assert "2. Original cataloging tools" in leaves[3]
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
index 6c357ae..5929b65 100644
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -1,4 +1,3 @@
-
 import pytest
 from fatcat_scholar.schema import *
@@ -6,10 +5,12 @@ from fatcat_scholar.schema import *
 def test_scrub():
     vectors = [
-        ('“Please clean this piece… of text</b>„', '"Please clean this piece... of text"'),
+        (
+            "“Please clean this piece… of text</b>„",
+            '"Please clean this piece... of text"',
+        ),
         ("<jats:p>blah", "blah"),
     ]
     for raw, fixed in vectors:
         assert fixed == scrub_text(raw)
-
diff --git a/tests/test_transform.py b/tests/test_transform.py
index 3c29d18..d831f47 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -1,28 +1,34 @@
-
 import pytest
 from fatcat_openapi_client import ReleaseEntity
 from fatcat_scholar.schema import *
 from fatcat_scholar.api_entities import *
+
 def test_es_release_from_release():
-    with open('tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json', 'r') as f:
+    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
         release = entity_from_json(f.read(), ReleaseEntity)
-    
+
     obj = es_release_from_release(release)
     d = json.loads(obj.json())
-    assert obj.ident == release.ident == d['ident'] == "hsmo6p4smrganpb3fndaj2lon4"
+    assert obj.ident == release.ident == d["ident"] == "hsmo6p4smrganpb3fndaj2lon4"
     assert obj.doi_registrar == "crossref"
     assert obj.doi_prefix == "10.7717"
+
 def test_es_biblio_from_release():
-    with open('tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json', 'r') as f:
+    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
         release = entity_from_json(f.read(), ReleaseEntity)
-    
+
     obj = es_biblio_from_release(release)
     d = json.loads(obj.json())
-    assert obj.release_ident == release.ident == d['release_ident'] == "hsmo6p4smrganpb3fndaj2lon4"
+    assert (
+        obj.release_ident
+        == release.ident
+        == d["release_ident"]
+        == "hsmo6p4smrganpb3fndaj2lon4"
+    )
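
The transform_heavy() hunk above fills in a PDF thumbnail URL from the fulltext file's SHA-1, sharding by the first two hex characters. A minimal sketch of that path scheme, assuming the covid19.fatcat.wiki host and directory layout hard-coded in the diff (both are explicitly a hack there and may change):

# Sketch only: mirrors the thumbnail-path hack in transform_heavy(); the host
# and layout are copied from the diff above, not a stable API.
def pdf_thumbnail_url(sha1: str, host: str = "https://covid19.fatcat.wiki") -> str:
    # thumbnails are sharded by the first two hex characters of the PDF's SHA-1
    return f"{host}/fulltext_web/thumbnail/{sha1[:2]}/{sha1}.png"

# pdf_thumbnail_url("c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521")
# -> "https://covid19.fatcat.wiki/fulltext_web/thumbnail/c9/c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521.png"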
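
fulltext_pref_list() in the work_pipeline.py hunk ranks a work's releases by sorting on a tuple of booleans and values (published/updated stage first, then presence of volume, container, PMID, and so on) with reverse=True. A stripped-down illustration of how such a key orders records, using a hypothetical SimpleRelease stand-in and a shorter key than the real function:

# Sketch only: SimpleRelease and the three-element key are invented for
# illustration; the real code sorts fatcat ReleaseEntity objects with a longer key.
from typing import NamedTuple, Optional


class SimpleRelease(NamedTuple):
    ident: str
    release_stage: Optional[str]
    volume: Optional[str]
    release_year: Optional[int]


def pref_order(releases):
    # reverse=True means larger keys win: True > False, later years sort earlier
    ranked = sorted(
        releases,
        reverse=True,
        key=lambda r: (
            r.release_stage == "published",
            r.volume is not None,
            r.release_year or 0,
        ),
    )
    return [r.ident for r in ranked]


print(pref_order([
    SimpleRelease("aaa", "submitted", None, 2020),
    SimpleRelease("bbb", "published", "12", 2019),
]))
# -> ['bbb', 'aaa']: the published version with a volume outranks the newer preprint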
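
fetch_sim() in the same file maps printed page numbers to djvu leaf numbers via the issue item's "page_numbers" metadata and only extracts leaves whose page falls inside the release's page range. A stripped-down sketch of that selection step, with an invented metadata blob standing in for the real archive.org scandata:

# Sketch only: the nested "page_numbers"/"pages" structure mirrors the access in
# fetch_sim(); the sample leaf numbers and pages below are made up.
from typing import Any, Dict, List, Tuple


def select_leaves(
    issue_meta: Dict[str, Any], first_page: int, last_page: int
) -> Tuple[Dict[int, str], List[int]]:
    leaf_index: Dict[int, str] = {}  # leafNum -> printed page number (string)
    leaf_list: List[int] = []  # leaves whose page falls within [first_page, last_page]
    for entry in issue_meta.get("page_numbers", {}).get("pages", []):
        page_num = entry["pageNumber"]
        leaf_index[entry["leafNum"]] = page_num
        if page_num.isdigit() and first_page <= int(page_num) <= last_page:
            leaf_list.append(entry["leafNum"])
    return leaf_index, leaf_list


sample_meta = {
    "page_numbers": {
        "pages": [
            {"leafNum": 7, "pageNumber": "479"},
            {"leafNum": 8, "pageNumber": "480"},
            {"leafNum": 9, "pageNumber": ""},  # unnumbered plate
            {"leafNum": 10, "pageNumber": "490"},
        ]
    }
}
print(select_leaves(sample_meta, 479, 489)[1])  # -> [7, 8]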