Diffstat (limited to 'fatcat_scholar')
-rw-r--r--  fatcat_scholar/api_entities.py  |   6
-rw-r--r--  fatcat_scholar/djvu.py          |  12
-rwxr-xr-x  fatcat_scholar/grobid2json.py   | 174
-rw-r--r--  fatcat_scholar/hacks.py         |   6
-rw-r--r--  fatcat_scholar/issue_db.py      | 250
-rw-r--r--  fatcat_scholar/sandcrawler.py   |  35
-rw-r--r--  fatcat_scholar/schema.py        | 102
-rw-r--r--  fatcat_scholar/search.py        |  70
-rw-r--r--  fatcat_scholar/sim_pipeline.py  | 110
-rw-r--r--  fatcat_scholar/transform.py     | 267
-rw-r--r--  fatcat_scholar/web.py           |  64
-rw-r--r--  fatcat_scholar/work_pipeline.py | 188
12 files changed, 778 insertions, 506 deletions
diff --git a/fatcat_scholar/api_entities.py b/fatcat_scholar/api_entities.py
index 738c5c8..df24eda 100644
--- a/fatcat_scholar/api_entities.py
+++ b/fatcat_scholar/api_entities.py
@@ -1,10 +1,10 @@
-
import json
import collections
from fatcat_openapi_client import ApiClient
_global_serde_api_client = ApiClient()
+
def entity_to_dict(entity, api_client=None):
"""
Hack to take advantage of the code-generated serialization code.
@@ -19,6 +19,7 @@ def entity_to_dict(entity, api_client=None):
api_client = _global_serde_api_client
return api_client.sanitize_for_serialization(entity)
+
def entity_from_json(json_str, entity_type, api_client=None):
"""
Hack to take advantage of the code-generated deserialization code
@@ -27,10 +28,11 @@ def entity_from_json(json_str, entity_type, api_client=None):
"""
if not api_client:
api_client = _global_serde_api_client
- thing = collections.namedtuple('Thing', ['data'])
+ thing = collections.namedtuple("Thing", ["data"])
thing.data = json_str
return api_client.deserialize(thing, entity_type)
+
def entity_from_dict(obj, entity_type, api_client=None):
json_str = json.dumps(obj)
return entity_from_json(json_str, entity_type, api_client=api_client)
diff --git a/fatcat_scholar/djvu.py b/fatcat_scholar/djvu.py
index b4a0774..ca3e412 100644
--- a/fatcat_scholar/djvu.py
+++ b/fatcat_scholar/djvu.py
@@ -1,9 +1,11 @@
-
from io import StringIO
from typing import List, Dict, Tuple, Optional, Any, Sequence
import xml.etree.ElementTree as ET
-def djvu_extract_leaf_texts(blob: StringIO, only_leaves: Optional[List[int]] = None) -> Dict[int, str]:
+
+def djvu_extract_leaf_texts(
+ blob: StringIO, only_leaves: Optional[List[int]] = None
+) -> Dict[int, str]:
"""
Takes an in-memory djvu XML string (note: not an actual djvu file, just the
IA XML file type), and iterates throug
@@ -21,12 +23,12 @@ def djvu_extract_leaf_texts(blob: StringIO, only_leaves: Optional[List[int]] = N
continue
# <OBJECT data="file://localhost//tmp/derive/ERIC_ED441501//ERIC_ED441501.djvu" height="6545" type="image/x.djvu" usemap="ERIC_ED441501_0002.djvu" width="5048">
- usemap = element.get('usemap')
+ usemap = element.get("usemap")
if not usemap:
continue
leaf_num = None
try:
- leaf_num = int(usemap.replace('.djvu', '').split('_')[-1])
+ leaf_num = int(usemap.replace(".djvu", "").split("_")[-1])
except:
continue
if only_leaves is not None and leaf_num is not None:
@@ -42,7 +44,7 @@ def djvu_extract_leaf_texts(blob: StringIO, only_leaves: Optional[List[int]] = N
if p_text:
paragraph_texts.append(p_text)
page_text = "\n".join(paragraph_texts)
- #print(f"### {leaf_num}\n{page_text}\n")
+ # print(f"### {leaf_num}\n{page_text}\n")
if page_text:
leaf_text[leaf_num] = page_text
element.clear()
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index 9c2ffad..57d039e 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -33,52 +33,55 @@ import xml.etree.ElementTree as ET
xml_ns = "http://www.w3.org/XML/1998/namespace"
ns = "http://www.tei-c.org/ns/1.0"
+
def all_authors(elem):
names = []
- for author in elem.findall('.//{%s}author' % ns):
- pn = author.find('./{%s}persName' % ns)
+ for author in elem.findall(".//{%s}author" % ns):
+ pn = author.find("./{%s}persName" % ns)
if not pn:
continue
- given_name = pn.findtext('./{%s}forename' % ns) or None
- surname = pn.findtext('./{%s}surname' % ns) or None
- full_name = ' '.join(pn.itertext())
+ given_name = pn.findtext("./{%s}forename" % ns) or None
+ surname = pn.findtext("./{%s}surname" % ns) or None
+ full_name = " ".join(pn.itertext())
obj = dict(name=full_name)
if given_name:
- obj['given_name'] = given_name
+ obj["given_name"] = given_name
if surname:
- obj['surname'] = surname
- ae = author.find('./{%s}affiliation' % ns)
+ obj["surname"] = surname
+ ae = author.find("./{%s}affiliation" % ns)
if ae:
affiliation = dict()
- for on in ae.findall('./{%s}orgName' % ns):
- affiliation[on.get('type')] = on.text
- addr_e = ae.find('./{%s}address' % ns)
+ for on in ae.findall("./{%s}orgName" % ns):
+ affiliation[on.get("type")] = on.text
+ addr_e = ae.find("./{%s}address" % ns)
if addr_e:
address = dict()
for t in addr_e.getchildren():
- address[t.tag.split('}')[-1]] = t.text
+ address[t.tag.split("}")[-1]] = t.text
if address:
- affiliation['address'] = address
- #affiliation['address'] = {
+ affiliation["address"] = address
+ # affiliation['address'] = {
# 'post_code': addr.findtext('./{%s}postCode' % ns) or None,
# 'settlement': addr.findtext('./{%s}settlement' % ns) or None,
# 'country': addr.findtext('./{%s}country' % ns) or None,
- #}
- obj['affiliation'] = affiliation
+ # }
+ obj["affiliation"] = affiliation
names.append(obj)
return names
def journal_info(elem):
journal = dict()
- journal['name'] = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
- journal['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
- if journal['publisher'] == '':
- journal['publisher'] = None
- journal['issn'] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
- journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
- journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- journal['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+ journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
+ journal["publisher"] = elem.findtext(
+ ".//{%s}publicationStmt/{%s}publisher" % (ns, ns)
+ )
+ if journal["publisher"] == "":
+ journal["publisher"] = None
+ journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
+ journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
+ journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+ journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
keys = list(journal.keys())
# remove empty/null keys
@@ -90,32 +93,32 @@ def journal_info(elem):
def biblio_info(elem):
ref = dict()
- ref['id'] = elem.attrib.get('{http://www.w3.org/XML/1998/namespace}id')
+ ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
# Title stuff is messy in references...
- ref['title'] = elem.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
- other_title = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
+ ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
+ other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
if other_title:
- if ref['title']:
- ref['journal'] = other_title
+ if ref["title"]:
+ ref["journal"] = other_title
else:
- ref['journal'] = None
- ref['title'] = other_title
- ref['authors'] = all_authors(elem)
- ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
- if ref['publisher'] == '':
- ref['publisher'] = None
+ ref["journal"] = None
+ ref["title"] = other_title
+ ref["authors"] = all_authors(elem)
+ ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
+ if ref["publisher"] == "":
+ ref["publisher"] = None
date = elem.find('.//{%s}date[@type="published"]' % ns)
- ref['date'] = (date != None) and date.attrib.get('when')
- ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- ref['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- el = elem.find('.//{%s}ptr[@target]' % ns)
+ ref["date"] = (date != None) and date.attrib.get("when")
+ ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+ ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+ el = elem.find(".//{%s}ptr[@target]" % ns)
if el is not None:
- ref['url'] = el.attrib['target']
+ ref["url"] = el.attrib["target"]
# Hand correction
- if ref['url'].endswith(".Lastaccessed"):
- ref['url'] = ref['url'].replace(".Lastaccessed", "")
+ if ref["url"].endswith(".Lastaccessed"):
+ ref["url"] = ref["url"].replace(".Lastaccessed", "")
else:
- ref['url'] = None
+ ref["url"] = None
return ref
@@ -128,48 +131,50 @@ def teixml2json(content, encumbered=True):
info = dict()
- #print(content)
- #print(content.getvalue())
+ # print(content)
+ # print(content.getvalue())
tree = ET.parse(content)
tei = tree.getroot()
- header = tei.find('.//{%s}teiHeader' % ns)
+ header = tei.find(".//{%s}teiHeader" % ns)
if header is None:
raise ValueError("XML does not look like TEI format")
- application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0]
- info['grobid_version'] = application_tag.attrib['version'].strip()
- info['grobid_timestamp'] = application_tag.attrib['when'].strip()
- info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
- info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
- info['journal'] = journal_info(header)
+ application_tag = header.findall(".//{%s}appInfo/{%s}application" % (ns, ns))[0]
+ info["grobid_version"] = application_tag.attrib["version"].strip()
+ info["grobid_timestamp"] = application_tag.attrib["when"].strip()
+ info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
+ info["authors"] = all_authors(
+ header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
+ )
+ info["journal"] = journal_info(header)
date = header.find('.//{%s}date[@type="published"]' % ns)
- info['date'] = (date != None) and date.attrib.get('when')
- info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
- info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
- if info['doi']:
- info['doi'] = info['doi'].lower()
+ info["date"] = (date != None) and date.attrib.get("when")
+ info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
+ info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
+ if info["doi"]:
+ info["doi"] = info["doi"].lower()
refs = []
- for (i, bs) in enumerate(tei.findall('.//{%s}listBibl/{%s}biblStruct' % (ns, ns))):
+ for (i, bs) in enumerate(tei.findall(".//{%s}listBibl/{%s}biblStruct" % (ns, ns))):
ref = biblio_info(bs)
- ref['index'] = i
+ ref["index"] = i
refs.append(ref)
- info['citations'] = refs
+ info["citations"] = refs
- text = tei.find('.//{%s}text' % (ns))
- #print(text.attrib)
- if text.attrib.get('{%s}lang' % xml_ns):
- info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
+ text = tei.find(".//{%s}text" % (ns))
+ # print(text.attrib)
+ if text.attrib.get("{%s}lang" % xml_ns):
+ info["language_code"] = text.attrib["{%s}lang" % xml_ns] # xml:lang
if encumbered:
- el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
- info['abstract'] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}text/{%s}body' % (ns, ns))
- info['body'] = (el or None) and " ".join(el.itertext()).strip()
+ el = tei.find(".//{%s}profileDesc/{%s}abstract" % (ns, ns))
+ info["abstract"] = (el or None) and " ".join(el.itertext()).strip()
+ el = tei.find(".//{%s}text/{%s}body" % (ns, ns))
+ info["body"] = (el or None) and " ".join(el.itertext()).strip()
el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
- info['acknowledgement'] = (el or None) and " ".join(el.itertext()).strip()
+ info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip()
el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
- info['annex'] = (el or None) and " ".join(el.itertext()).strip()
+ info["annex"] = (el or None) and " ".join(el.itertext()).strip()
# remove empty/null keys
keys = list(info.keys())
@@ -178,24 +183,31 @@ def teixml2json(content, encumbered=True):
info.pop(k)
return info
-def main(): # pragma no cover
+
+def main(): # pragma no cover
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="GROBID TEI XML to JSON",
- usage="%(prog)s [options] <teifile>...")
- parser.add_argument("--no-encumbered",
+ usage="%(prog)s [options] <teifile>...",
+ )
+ parser.add_argument(
+ "--no-encumbered",
action="store_true",
- help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
- parser.add_argument("teifiles", nargs='+')
+ help="don't include ambiguously copyright encumbered fields (eg, abstract, body)",
+ )
+ parser.add_argument("teifiles", nargs="+")
args = parser.parse_args()
for filename in args.teifiles:
- content = open(filename, 'r')
- print(json.dumps(
- teixml2json(content,
- encumbered=(not args.no_encumbered)),
- sort_keys=True))
+ content = open(filename, "r")
+ print(
+ json.dumps(
+ teixml2json(content, encumbered=(not args.no_encumbered)),
+ sort_keys=True,
+ )
+ )
+
-if __name__=='__main__': # pragma no cover
+if __name__ == "__main__": # pragma no cover
main()
diff --git a/fatcat_scholar/hacks.py b/fatcat_scholar/hacks.py
index fc1dacd..710a25f 100644
--- a/fatcat_scholar/hacks.py
+++ b/fatcat_scholar/hacks.py
@@ -1,10 +1,10 @@
-
import typing
import jinja2
from starlette.background import BackgroundTask
from starlette.templating import _TemplateResponse
+
class Jinja2Templates:
"""
This is a patched version of starlette.templating.Jinja2Templates that
@@ -15,7 +15,9 @@ class Jinja2Templates:
assert jinja2 is not None, "jinja2 must be installed to use Jinja2Templates"
self.env = self.get_env(directory, extensions)
- def get_env(self, directory: str, extensions: typing.List[str] = []) -> "jinja2.Environment":
+ def get_env(
+ self, directory: str, extensions: typing.List[str] = []
+ ) -> "jinja2.Environment":
@jinja2.contextfunction
def url_for(context: dict, name: str, **path_params: typing.Any) -> str:
request = context["request"]
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
index 4f5ff53..12ffa32 100644
--- a/fatcat_scholar/issue_db.py
+++ b/fatcat_scholar/issue_db.py
@@ -1,4 +1,3 @@
-
import sys
import json
import sqlite3
@@ -9,6 +8,7 @@ import fatcat_openapi_client
import elasticsearch
from elasticsearch_dsl import Search, Q
+
@dataclass
class SimPubRow:
sim_pubid: str
@@ -23,7 +23,17 @@ class SimPubRow:
wikidata_qid: Optional[str]
def tuple(self):
- return (self.sim_pubid, self.pub_collection, self.title, self.issn, self.pub_type, self.publisher, self.container_issnl, self.container_ident, self.wikidata_qid)
+ return (
+ self.sim_pubid,
+ self.pub_collection,
+ self.title,
+ self.issn,
+ self.pub_type,
+ self.publisher,
+ self.container_issnl,
+ self.container_ident,
+ self.wikidata_qid,
+ )
@classmethod
def from_tuple(cls, row: Any) -> "SimPubRow":
@@ -39,6 +49,7 @@ class SimPubRow:
wikidata_qid=row[8],
)
+
@dataclass
class SimIssueRow:
"""
@@ -46,6 +57,7 @@ class SimIssueRow:
- distinguish between release count that can do full link with pages, or
just in this year/volume/issue?
"""
+
issue_item: str
sim_pubid: str
year: Optional[int]
@@ -56,7 +68,16 @@ class SimIssueRow:
release_count: Optional[int]
def tuple(self):
- return (self.issue_item, self.sim_pubid, self.year, self.volume, self.issue, self.first_page, self.last_page, self.release_count)
+ return (
+ self.issue_item,
+ self.sim_pubid,
+ self.year,
+ self.volume,
+ self.issue,
+ self.first_page,
+ self.last_page,
+ self.release_count,
+ )
@classmethod
def from_tuple(cls, row: Any) -> "SimIssueRow":
@@ -71,6 +92,7 @@ class SimIssueRow:
release_count=row[7],
)
+
@dataclass
class ReleaseCountsRow:
sim_pubid: str
@@ -80,82 +102,100 @@ class ReleaseCountsRow:
volume: Optional[str]
def tuple(self):
- return (self.sim_pubid, self.year, self.volume, self.year_in_sim, self.release_count)
+ return (
+ self.sim_pubid,
+ self.year,
+ self.volume,
+ self.year_in_sim,
+ self.release_count,
+ )
-def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, issue: str) -> int:
+def es_issue_count(
+ es_client: Any, container_id: str, year: int, volume: str, issue: str
+) -> int:
search = Search(using=es_client, index="fatcat_release")
- search = search\
- .filter("term", container_id=container_id)\
- .filter("term", year=year)\
- .filter("term", volume=volume)\
- .filter("term", issue=issue)\
+ search = (
+ search.filter("term", container_id=container_id)
+ .filter("term", year=year)
+ .filter("term", volume=volume)
+ .filter("term", issue=issue)
.extra(request_cache=True)
+ )
return search.count()
+
def es_container_aggs(es_client: Any, container_id: str) -> List[Dict[str, Any]]:
"""
What is being returned is a list of dicts, each with year, volume, count
keys.
"""
search = Search(using=es_client, index="fatcat_release")
- search = search\
- .filter("term", container_id=container_id)
- search.aggs\
- .bucket('years', 'terms', field="year")\
- .bucket('volumes', 'terms', field="volume")
+ search = search.filter("term", container_id=container_id)
+ search.aggs.bucket("years", "terms", field="year").bucket(
+ "volumes", "terms", field="volume"
+ )
search = search[:0]
res = search.execute()
ret = []
for year in res.aggregations.years.buckets:
for volume in year.volumes.buckets:
ret.append(dict(count=volume.doc_count, year=year.key, volume=volume.key))
- #print(ret[-1])
+ # print(ret[-1])
return ret
-class IssueDB():
+class IssueDB:
def __init__(self, db_file):
"""
To create a temporary database, pass ":memory:" as db_file
"""
- self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+ self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
self._pubid2container_map: Dict[str, Optional[str]] = dict()
self._container2pubid_map: Dict[str, Optional[str]] = dict()
def init_db(self):
- self.db.executescript("""
+ self.db.executescript(
+ """
PRAGMA main.page_size = 4096;
PRAGMA main.cache_size = 20000;
PRAGMA main.locking_mode = EXCLUSIVE;
PRAGMA main.synchronous = OFF;
- """)
- with open('schema/issue_db.sql', 'r') as fschema:
+ """
+ )
+ with open("schema/issue_db.sql", "r") as fschema:
self.db.executescript(fschema.read())
def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None:
if not cur:
cur = self.db.cursor()
- cur.execute("INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)",
- pub.tuple())
+ cur.execute(
+ "INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)", pub.tuple()
+ )
def insert_sim_issue(self, issue: SimIssueRow, cur: Any = None) -> None:
if not cur:
cur = self.db.cursor()
- cur.execute("INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)",
- issue.tuple())
+ cur.execute(
+ "INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)", issue.tuple()
+ )
def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None:
if not cur:
cur = self.db.cursor()
- cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)",
- counts.tuple())
+ cur.execute(
+ "INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)", counts.tuple()
+ )
def pubid2container(self, sim_pubid: str) -> Optional[str]:
if sim_pubid in self._pubid2container_map:
return self._pubid2container_map[sim_pubid]
- row = list(self.db.execute("SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]))
+ row = list(
+ self.db.execute(
+ "SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]
+ )
+ )
if row:
self._pubid2container_map[sim_pubid] = row[0][0]
return row[0][0]
@@ -166,7 +206,12 @@ class IssueDB():
def container2pubid(self, container_ident: str) -> Optional[str]:
if container_ident in self._container2pubid_map:
return self._container2pubid_map[container_ident]
- row = list(self.db.execute("SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;", [container_ident]))
+ row = list(
+ self.db.execute(
+ "SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;",
+ [container_ident],
+ )
+ )
if row:
self._container2pubid_map[container_ident] = row[0][0]
return row[0][0]
@@ -174,14 +219,23 @@ class IssueDB():
self._pubid2container_map[container_ident] = None
return None
- def lookup_issue(self, sim_pubid: str, volume: str, issue: str) -> Optional[SimIssueRow]:
- row = list(self.db.execute("SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;", [sim_pubid, volume, issue]))
+ def lookup_issue(
+ self, sim_pubid: str, volume: str, issue: str
+ ) -> Optional[SimIssueRow]:
+ row = list(
+ self.db.execute(
+ "SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;",
+ [sim_pubid, volume, issue],
+ )
+ )
if not row:
return None
return SimIssueRow.from_tuple(row[0])
def lookup_pub(self, sim_pubid: str) -> Optional[SimPubRow]:
- row = list(self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]))
+ row = list(
+ self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid])
+ )
if not row:
return None
return SimPubRow.from_tuple(row[0])
@@ -196,22 +250,22 @@ class IssueDB():
if not line:
continue
obj = json.loads(line)
- meta = obj['metadata']
- assert "periodicals" in meta['collection']
+ meta = obj["metadata"]
+ assert "periodicals" in meta["collection"]
container: Optional[fatcat_openapi_client.ContainerEntity] = None
- if meta.get('issn'):
+ if meta.get("issn"):
try:
- container = api.lookup_container(issnl=meta['issn'])
+ container = api.lookup_container(issnl=meta["issn"])
except fatcat_openapi_client.ApiException as ae:
if ae.status != 404:
raise ae
row = SimPubRow(
- sim_pubid=meta['sim_pubid'],
- pub_collection=meta['identifier'],
- title=meta['title'],
- issn=meta.get('issn'),
- pub_type=meta.get('pub_type'),
- publisher=meta.get('publisher'),
+ sim_pubid=meta["sim_pubid"],
+ pub_collection=meta["identifier"],
+ title=meta["title"],
+ issn=meta.get("issn"),
+ pub_type=meta.get("pub_type"),
+ publisher=meta.get("publisher"),
container_issnl=container and container.issnl,
container_ident=container and container.ident,
wikidata_qid=container and container.wikidata_qid,
@@ -230,28 +284,32 @@ class IssueDB():
if not line:
continue
obj = json.loads(line)
- meta = obj['metadata']
- assert "periodicals" in meta['collection']
- #pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0]
- issue_item = meta['identifier']
+ meta = obj["metadata"]
+ assert "periodicals" in meta["collection"]
+ # pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0]
+ issue_item = meta["identifier"]
# don't index meta items
# TODO: handle more weird suffixes like "1-2", "_part_1", "_index-contents"
if issue_item.endswith("_index") or issue_item.endswith("_contents"):
continue
- sim_pubid=meta['sim_pubid']
+ sim_pubid = meta["sim_pubid"]
year: Optional[int] = None
- if meta.get('date') and meta['date'][:4].isdigit():
- year = int(meta['date'][:4])
- volume = meta.get('volume')
- issue = meta.get('issue')
+ if meta.get("date") and meta["date"][:4].isdigit():
+ year = int(meta["date"][:4])
+ volume = meta.get("volume")
+ issue = meta.get("issue")
first_page: Optional[int] = None
last_page: Optional[int] = None
- if obj.get('page_numbers'):
- pages = [p['pageNumber'] for p in obj['page_numbers']['pages'] if p['pageNumber']]
+ if obj.get("page_numbers"):
+ pages = [
+ p["pageNumber"]
+ for p in obj["page_numbers"]["pages"]
+ if p["pageNumber"]
+ ]
pages = [int(p) for p in pages if p.isdigit()]
if len(pages):
first_page = min(pages)
@@ -261,7 +319,9 @@ class IssueDB():
if year and volume and issue:
container_id = self.pubid2container(sim_pubid)
if container_id:
- release_count = es_issue_count(es_client, container_id, year, volume, issue)
+ release_count = es_issue_count(
+ es_client, container_id, year, volume, issue
+ )
row = SimIssueRow(
issue_item=issue_item,
@@ -278,17 +338,21 @@ class IssueDB():
self.db.commit()
def load_counts(self, es_client: Any):
- all_pub_containers = list(self.db.execute('SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;'))
+ all_pub_containers = list(
+ self.db.execute(
+ "SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;"
+ )
+ )
cur: Any = self.db.cursor()
for (sim_pubid, container_ident) in all_pub_containers:
aggs = es_container_aggs(es_client, container_ident)
for agg in aggs:
row = ReleaseCountsRow(
sim_pubid=sim_pubid,
- year_in_sim=False, # TODO
- release_count=agg['count'],
- year=agg['year'],
- volume=agg['volume'],
+ year_in_sim=False, # TODO
+ release_count=agg["count"],
+ year=agg["year"],
+ volume=agg["volume"],
)
self.insert_release_counts(row, cur)
cur.close()
@@ -303,35 +367,48 @@ def main():
"""
parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
subparsers = parser.add_subparsers()
- parser.add_argument("--db-file",
+ parser.add_argument(
+ "--db-file",
help="sqlite3 database file to open",
- default='data/issue_db.sqlite',
- type=str)
-
- sub = subparsers.add_parser('init_db',
- help="create sqlite3 output file and tables")
- sub.set_defaults(func='init_db')
-
- sub = subparsers.add_parser('load_pubs',
- help="update container-level stats from JSON file")
- sub.set_defaults(func='load_pubs')
- sub.add_argument("json_file",
+ default="data/issue_db.sqlite",
+ type=str,
+ )
+
+ sub = subparsers.add_parser("init_db", help="create sqlite3 output file and tables")
+ sub.set_defaults(func="init_db")
+
+ sub = subparsers.add_parser(
+ "load_pubs", help="update container-level stats from JSON file"
+ )
+ sub.set_defaults(func="load_pubs")
+ sub.add_argument(
+ "json_file",
help="collection-level metadata, as JSON-lines",
- nargs='?', default=sys.stdin, type=argparse.FileType('r'))
-
- sub = subparsers.add_parser('load_issues',
- help="update item-level stats from JSON file")
- sub.set_defaults(func='load_issues')
- sub.add_argument("json_file",
+ nargs="?",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ sub = subparsers.add_parser(
+ "load_issues", help="update item-level stats from JSON file"
+ )
+ sub.set_defaults(func="load_issues")
+ sub.add_argument(
+ "json_file",
help="item-level metadata, as JSON-lines",
- nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+ nargs="?",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
- sub = subparsers.add_parser('load_counts',
- help="update volume-level stats from elasticsearch endpoint")
- sub.set_defaults(func='load_counts')
+ sub = subparsers.add_parser(
+ "load_counts", help="update volume-level stats from elasticsearch endpoint"
+ )
+ sub.set_defaults(func="load_counts")
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -342,15 +419,16 @@ def main():
api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient())
es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
- if args.func == 'load_pubs':
+ if args.func == "load_pubs":
idb.load_pubs(args.json_file, api)
- elif args.func == 'load_issues':
+ elif args.func == "load_issues":
idb.load_issues(args.json_file, es_client)
- elif args.func == 'load_counts':
+ elif args.func == "load_counts":
idb.load_counts(es_client)
else:
func = getattr(idb, args.func)
func()
-if __name__=="__main__":
+
+if __name__ == "__main__":
main()
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index db6014f..408682f 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -1,16 +1,15 @@
-
import json
import minio
import requests
from typing import Dict, Optional, Any
-class SandcrawlerPostgrestClient():
+class SandcrawlerPostgrestClient:
def __init__(self, api_url: str):
self.api_url = api_url
-
+
def get_grobid(self, sha1: str) -> Optional[Dict[str, Any]]:
- resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex='eq.'+sha1))
+ resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1))
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
@@ -20,8 +19,13 @@ class SandcrawlerPostgrestClient():
class SandcrawlerMinioClient(object):
-
- def __init__(self, host_url: str, access_key: Optional[str] = None, secret_key: Optional[str] = None, default_bucket: Optional[str] = "sandcrawler"):
+ def __init__(
+ self,
+ host_url: str,
+ access_key: Optional[str] = None,
+ secret_key: Optional[str] = None,
+ default_bucket: Optional[str] = "sandcrawler",
+ ):
"""
host is minio connection string (host:port)
access and secret key are as expected
@@ -34,10 +38,7 @@ class SandcrawlerMinioClient(object):
secret_key=os.environ['MINIO_SECRET_KEY'],
"""
self.mc = minio.Minio(
- host_url,
- access_key=access_key,
- secret_key=secret_key,
- secure=False,
+ host_url, access_key=access_key, secret_key=secret_key, secure=False,
)
self.default_bucket = default_bucket
@@ -48,14 +49,9 @@ class SandcrawlerMinioClient(object):
prefix = ""
assert len(sha1hex) == 40
obj_path = "{}{}/{}/{}/{}{}".format(
- prefix,
- folder,
- sha1hex[0:2],
- sha1hex[2:4],
- sha1hex,
- extension,
+ prefix, folder, sha1hex[0:2], sha1hex[2:4], sha1hex, extension,
)
- return obj_path
+ return obj_path
def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
"""
@@ -67,9 +63,6 @@ class SandcrawlerMinioClient(object):
if not bucket:
bucket = self.default_bucket
assert bucket
- blob = self.mc.get_object(
- bucket,
- obj_path,
- )
+ blob = self.mc.get_object(bucket, obj_path,)
# TODO: optionally verify SHA-1?
return blob.data
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 10742fb..110991d 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -1,4 +1,3 @@
-
"""
Originally wrote these as dataclasses using pydantic.dataclasses, but we don't
get serialization for free with those. This is useful for things like
@@ -22,6 +21,7 @@ class DocType(str, Enum):
work = "work"
sim_page = "sim_page"
+
class IntermediateBundle(BaseModel):
doc_type: DocType
releases: List[ReleaseEntity]
@@ -47,6 +47,7 @@ class AccessType(str, Enum):
loginwall = "loginwall"
shadow = "shadow"
+
class ScholarBiblio(BaseModel):
release_ident: Optional[str]
title: Optional[str]
@@ -60,12 +61,12 @@ class ScholarBiblio(BaseModel):
lang_code: Optional[str]
country_code: Optional[str]
volume: Optional[str]
- volume_int: Optional[str] # TODO: needed?
+ volume_int: Optional[str] # TODO: needed?
issue: Optional[str]
- issue_int: Optional[str] # TODO: needed?
+ issue_int: Optional[str] # TODO: needed?
pages: Optional[str]
first_page: Optional[str]
- first_page_int: Optional[str] # TODO: needed?
+ first_page_int: Optional[str] # TODO: needed?
number: Optional[str]
doi: Optional[str]
@@ -93,6 +94,7 @@ class ScholarBiblio(BaseModel):
contrib_names: List[str]
affiliations: List[str]
+
class ScholarFulltext(BaseModel):
lang_code: Optional[str]
body: str
@@ -106,6 +108,7 @@ class ScholarFulltext(BaseModel):
access_url: Optional[str]
access_type: Optional[AccessType]
+
class ScholarRelease(BaseModel):
ident: Optional[str]
revision: Optional[str]
@@ -133,16 +136,19 @@ class ScholarRelease(BaseModel):
container_issnl: Optional[str]
container_type: Optional[str]
+
class ScholarSim(BaseModel):
issue_item: str
pub_collection: str
sim_pubid: str
first_page: Optional[str]
+
class ScholarAbstract(BaseModel):
body: str
lang_code: Optional[str]
+
class ScholarAccess(BaseModel):
access_type: AccessType
access_url: str
@@ -150,9 +156,10 @@ class ScholarAccess(BaseModel):
file_ident: Optional[str]
release_ident: Optional[str]
+
class ScholarDoc(BaseModel):
key: str
- doc_type: str # enum: work or page
+ doc_type: str # enum: work or page
doc_index_ts: datetime.datetime
work_ident: Optional[str]
tags: List[str] = []
@@ -164,29 +171,33 @@ class ScholarDoc(BaseModel):
releases: List[ScholarRelease]
access: List[ScholarAccess]
+
def doi_split_prefix(doi: str) -> str:
- return doi.split('/')[0]
+ return doi.split("/")[0]
+
def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
if not release.ext_ids.doi or not release.extra:
return None
- for registrar in ('crossref', 'datacite', 'jalc'):
+ for registrar in ("crossref", "datacite", "jalc"):
if registrar in release.extra:
return registrar
# TODO: should we default to Crossref?
return None
+
UNWANTED_ABSTRACT_PREFIXES = [
# roughly sort this long to short
- 'Abstract No Abstract ',
- 'Publisher Summary ',
- 'Abstract ',
- 'ABSTRACT ',
- 'Summary ',
- 'Background: ',
- 'Background ',
+ "Abstract No Abstract ",
+ "Publisher Summary ",
+ "Abstract ",
+ "ABSTRACT ",
+ "Summary ",
+ "Background: ",
+ "Background ",
]
+
def scrub_text(raw: str, mimetype: str = None) -> str:
"""
This function takes a mimetype-hinted string and tries to reduce it to a
@@ -201,25 +212,26 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
text = ftfy.fix_text(raw)
# remove HTML
- text = BeautifulSoup(text, 'html.parser').get_text()
+ text = BeautifulSoup(text, "html.parser").get_text()
# TODO: for performance, compile these as globals?
# Three regexes below adapted from Blendle cleaner.py
# https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29
- text = re.sub(r'…', '...', text)
- text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text)
- text = re.sub(r'[„“]|(\'\')|(,,)', '"', text)
- text = re.sub(r'\s+', ' ', text).strip()
+ text = re.sub(r"…", "...", text)
+ text = re.sub(r"[`‘’‛⸂⸃⸌⸍⸜⸝]", "'", text)
+ text = re.sub(r"[„“]|(\'\')|(,,)", '"', text)
+ text = re.sub(r"\s+", " ", text).strip()
# hack to remove abstract prefixes
for prefix in UNWANTED_ABSTRACT_PREFIXES:
if text.startswith(prefix):
- text = text[len(prefix):]
+ text = text[len(prefix) :]
break
assert text, "Empty abstract"
return text
+
def contrib_name(contrib: ReleaseContrib) -> str:
# TODO: support more cultural normals for name presentation
if contrib.raw_name:
@@ -231,36 +243,45 @@ def contrib_name(contrib: ReleaseContrib) -> str:
else:
return contrib.given_name
+
def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
# TODO
return None
+
def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
d = dict()
for abst in release.abstracts:
if not abst.lang in d:
- d[abst.lang] = ScholarAbstract(lang_code=abst.lang, body=scrub_text(abst.content))
+ d[abst.lang] = ScholarAbstract(
+ lang_code=abst.lang, body=scrub_text(abst.content)
+ )
return list(d.values())
+
def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
if release.container:
publisher = release.publisher
container_name = release.container.name
- container_original_name = release.container.extra and release.container.extra.get('original_name')
+ container_original_name = (
+ release.container.extra and release.container.extra.get("original_name")
+ )
container_ident = release.container.ident
container_type = release.container.container_type
container_issnl = release.container.issnl
- issns = [container_issnl,]
- if release.extra.get('issne'):
- issns.append(release.extra['issne'])
- if release.extra.get('issnp'):
- issns.append(release.extra['issnp'])
+ issns = [
+ container_issnl,
+ ]
+ if release.extra.get("issne"):
+ issns.append(release.extra["issne"])
+ if release.extra.get("issnp"):
+ issns.append(release.extra["issnp"])
issns = list(set(issns))
else:
- publisher = release.extra.get('publisher')
- container_name = release.extra.get('container_name')
+ publisher = release.extra.get("publisher")
+ container_name = release.extra.get("container_name")
container_original_name = None
container_ident = None
container_type = None
@@ -269,7 +290,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
first_page: Optional[str] = None
if release.pages:
- first_page = release.pages.split('-')[0]
+ first_page = release.pages.split("-")[0]
first_page_int: Optional[int] = None
if first_page and first_page.isdigit():
first_page_int = int(first_page)
@@ -285,7 +306,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
release_stage=release.release_stage,
withdrawn_status=release.withdrawn_status,
lang_code=release.language,
- country_code=release.extra and release.extra.get('country'),
+ country_code=release.extra and release.extra.get("country"),
volume=release.volume,
volume_int=None,
issue=release.issue,
@@ -294,7 +315,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
first_page=first_page,
first_page_int=None,
number=release.number,
-
doi=release.ext_ids.doi,
doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi),
doi_registrar=release_doi_registrar(release),
@@ -305,7 +325,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
arxiv_id=release.ext_ids.arxiv,
jstor_id=release.ext_ids.jstor,
mag_id=release.ext_ids.mag,
-
license_slug=release.license_slug,
publisher=publisher,
container_name=container_name,
@@ -314,14 +333,21 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
container_type=container_type,
container_issnl=container_issnl,
issns=issns,
-
# TODO; these filters sort of meh. refactor to be above?
- contrib_names=list(filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs])),
- contrib_count = len([c for c in release.contribs if c.index]),
- affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])),
+ contrib_names=list(
+ filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs])
+ ),
+ contrib_count=len([c for c in release.contribs if c.index]),
+ affiliations=list(
+ filter(
+ lambda x: bool(x),
+ [contrib_affiliation(c) for c in release.contribs if c.index],
+ )
+ ),
)
return ret
+
def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
if release.container:
@@ -330,7 +356,7 @@ def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
container_issnl = release.container.issnl
container_type = release.container.container_type
else:
- container_name = release.extra.get('container_name')
+ container_name = release.extra.get("container_name")
container_ident = None
container_issnl = None
container_type = None
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index d29e03b..5a61f53 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -1,4 +1,3 @@
-
"""
Helpers to make elasticsearch queries.
"""
@@ -17,6 +16,7 @@ from typing import List, Dict, Tuple, Optional, Any, Sequence
# i18n note: the use of gettext below doesn't actually do the translation here,
# it just ensures that the strings are caught by babel for translation later
+
class FulltextQuery(BaseModel):
q: Optional[str] = None
limit: Optional[int] = None
@@ -76,31 +76,42 @@ class FulltextHits(BaseModel):
offset: int
limit: int
deep_page_limit: int
- query_time_ms: int
+ query_time_ms: int
results: List[Any]
-def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
+def do_fulltext_search(
+ query: FulltextQuery, deep_page_limit: int = 2000
+) -> FulltextHits:
es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND)
search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
# Convert raw DOIs to DOI queries
- if query.q and len(query.q.split()) == 1 and query.q.startswith("10.") and query.q.count("/") >= 1:
+ if (
+ query.q
+ and len(query.q.split()) == 1
+ and query.q.startswith("10.")
+ and query.q.count("/") >= 1
+ ):
search = search.filter("terms", doi=query.q)
query.q = "*"
# type filters
if query.filter_type == "papers":
- search = search.filter("terms", type=[ "article-journal", "paper-conference", "chapter", ])
+ search = search.filter(
+ "terms", type=["article-journal", "paper-conference", "chapter",]
+ )
elif query.filter_type == "reports":
- search = search.filter("terms", type=[ "report", "standard", ])
+ search = search.filter("terms", type=["report", "standard",])
elif query.filter_type == "datasets":
- search = search.filter("terms", type=[ "dataset", "software", ])
+ search = search.filter("terms", type=["dataset", "software",])
elif query.filter_type == "everything" or query.filter_type == None:
pass
else:
- raise ValueError(f"Unknown 'filter_type' parameter value: '{query.filter_type}'")
+ raise ValueError(
+ f"Unknown 'filter_type' parameter value: '{query.filter_type}'"
+ )
# time filters
if query.filter_time == "past_week":
@@ -111,7 +122,9 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
# the later to catch papers which don't have release_date defined
year_ago_date = str(datetime.date.today() - datetime.timedelta(days=365))
this_year = datetime.date.today().year
- search = search.filter(Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year))
+ search = search.filter(
+ Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year)
+ )
elif query.filter_time == "since_2000":
search = search.filter("range", year=dict(gte=2000))
elif query.filter_time == "before_1925":
@@ -119,7 +132,9 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
elif query.filter_time == "all_time" or query.filter_time == None:
pass
else:
- raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'")
+ raise ValueError(
+ f"Unknown 'filter_time' parameter value: '{query.filter_time}'"
+ )
# availability filters
if query.filter_availability == "oa":
@@ -129,13 +144,15 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
elif query.filter_availability == "fulltext" or query.filter_availability == None:
search = search.filter("terms", access_type=["wayback", "ia_file", "ia_sim"])
else:
- raise ValueError(f"Unknown 'filter_availability' parameter value: '{query.filter_availability}'")
+ raise ValueError(
+ f"Unknown 'filter_availability' parameter value: '{query.filter_availability}'"
+ )
# we combined several queries to improve scoring.
# this query use the fancy built-in query string parser
basic_fulltext = Q(
- 'query_string',
+ "query_string",
query=query.q,
default_operator="AND",
analyze_wildcard=True,
@@ -150,12 +167,9 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
"everything",
],
)
- has_fulltext = Q(
- 'terms',
- access_type=["ia_sim", "ia_file", "wayback"],
- )
+ has_fulltext = Q("terms", access_type=["ia_sim", "ia_file", "wayback"],)
poor_metadata = Q(
- 'bool',
+ "bool",
should=[
# if these fields aren't set, metadata is poor. The more that do
# not exist, the stronger the signal.
@@ -168,11 +182,7 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
search = search.query(
"boosting",
- positive=Q(
- "bool",
- must=basic_fulltext,
- should=[has_fulltext],
- ),
+ positive=Q("bool", must=basic_fulltext, should=[has_fulltext],),
negative=poor_metadata,
negative_boost=0.5,
)
@@ -201,15 +211,15 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
# Avoid deep paging problem.
offset = deep_page_limit
- search = search[offset:offset+limit]
+ search = search[offset : offset + limit]
try:
resp = search.execute()
except elasticsearch.exceptions.RequestError as e:
# this is a "user" error
print("elasticsearch 400: " + str(e.info), file=sys.stderr)
- if e.info.get('error', {}).get('root_cause', {}):
- raise ValueError(str(e.info['error']['root_cause'][0].get('reason')))
+ if e.info.get("error", {}).get("root_cause", {}):
+ raise ValueError(str(e.info["error"]["root_cause"][0].get("reason")))
else:
raise ValueError(str(e.info))
except elasticsearch.exceptions.TransportError as e:
@@ -221,12 +231,12 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
results = []
for h in resp:
r = h._d_
- #print(json.dumps(h.meta._d_, indent=2))
- r['_highlights'] = []
- if 'highlight' in dir(h.meta):
+ # print(json.dumps(h.meta._d_, indent=2))
+ r["_highlights"] = []
+ if "highlight" in dir(h.meta):
highlights = h.meta.highlight._d_
for k in highlights:
- r['_highlights'] += highlights[k]
+ r["_highlights"] += highlights[k]
results.append(r)
for h in results:
@@ -235,7 +245,7 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
# "Crimes against Unicode"; production workaround
for key in h:
if type(h[key]) is str:
- h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
+ h[key] = h[key].encode("utf8", "ignore").decode("utf8")
return FulltextHits(
count_returned=len(results),
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index 6b52535..b84ac47 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -1,4 +1,3 @@
-
import os
import io
import sys
@@ -12,9 +11,17 @@ import internetarchive
from fatcat_scholar.api_entities import *
from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
+from fatcat_scholar.sandcrawler import (
+ SandcrawlerPostgrestClient,
+ SandcrawlerMinioClient,
+)
from fatcat_scholar.issue_db import IssueDB, SimIssueRow
-from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
+from fatcat_scholar.schema import (
+ es_biblio_from_release,
+ es_release_from_release,
+ DocType,
+ IntermediateBundle,
+)
def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]:
@@ -23,26 +30,27 @@ def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]:
collection, and simplifies it by removing fields. Motivation is to make
intermediate bundle files smaller.
"""
- full.pop('files')
- if 'ulrichs' in full and full['ulrichs']:
- full['ulrichs'][0].pop('reviews_mfl')
- full['ulrichs'][0].pop('editorial_description')
+ full.pop("files")
+ if "ulrichs" in full and full["ulrichs"]:
+ full["ulrichs"][0].pop("reviews_mfl")
+ full["ulrichs"][0].pop("editorial_description")
# these are interesting, but just too long
- full['ulrichs'][0].pop('online_availability_full_text')
- full['ulrichs'][0].pop('abstracting_indexing')
- full['ulrichs'][0].pop('publisher_and_ordering_details')
+ full["ulrichs"][0].pop("online_availability_full_text")
+ full["ulrichs"][0].pop("abstracting_indexing")
+ full["ulrichs"][0].pop("publisher_and_ordering_details")
return full
+
def truncate_issue_meta(full: Dict[str, Any]) -> Dict[str, Any]:
"""
Same as truncate_pub_meta() but for issue item metadata
"""
- full.pop('files')
+ full.pop("files")
return full
-class SimPipeline():
+class SimPipeline:
def __init__(self, issue_db: IssueDB):
self.issue_db: IssueDB = issue_db
self.ia_client = internetarchive.get_session()
@@ -60,44 +68,50 @@ class SimPipeline():
issue_item_metadata
"""
# fetch full metadata from API
- issue_meta = self.ia_client.get_metadata(issue_db_row['issue_item'])
- pub_meta = self.ia_client.get_metadata(issue_db_row['pub_collection'])
+ issue_meta = self.ia_client.get_metadata(issue_db_row["issue_item"])
+ pub_meta = self.ia_client.get_metadata(issue_db_row["pub_collection"])
leaf_index = dict()
leaf_list = []
- if not 'page_numbers' in issue_meta:
+ if not "page_numbers" in issue_meta:
# TODO: warn
return None
- for entry in issue_meta['page_numbers'].get('pages', []):
- page_num = entry['pageNumber']
- leaf_index[entry['leafNum']] = page_num
+ for entry in issue_meta["page_numbers"].get("pages", []):
+ page_num = entry["pageNumber"]
+ leaf_index[entry["leafNum"]] = page_num
if not (page_num and page_num.isdigit()):
continue
page_num = int(page_num)
- leaf_list.append(entry['leafNum'])
+ leaf_list.append(entry["leafNum"])
if not leaf_list:
return None
page_texts: List[Dict[str, Any]] = []
- issue_item = self.ia_client.get_item(issue_db_row['issue_item'])
- issue_item_djvu = issue_item.get_file(issue_db_row['issue_item'] + "_djvu.xml")
+ issue_item = self.ia_client.get_item(issue_db_row["issue_item"])
+ issue_item_djvu = issue_item.get_file(issue_db_row["issue_item"] + "_djvu.xml")
# override 'close()' method so we can still read out contents
djvu_bytes = io.BytesIO()
- djvu_bytes.close = lambda: None # type: ignore
+ djvu_bytes.close = lambda: None # type: ignore
assert issue_item_djvu.download(fileobj=djvu_bytes) == True
djvu_bytes.seek(0)
djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8"))
- del(djvu_bytes)
+ del djvu_bytes
leaf_dict = djvu_extract_leaf_texts(djvu_xml)
for leaf_num, raw_text in leaf_dict.items():
- page_texts.append(dict(page_num=leaf_index.get(leaf_num), leaf_num=leaf_num, raw_text=raw_text))
+ page_texts.append(
+ dict(
+ page_num=leaf_index.get(leaf_num),
+ leaf_num=leaf_num,
+ raw_text=raw_text,
+ )
+ )
return dict(
- issue_item=issue_db_row['issue_item'],
+ issue_item=issue_db_row["issue_item"],
pages=None,
page_texts=page_texts,
release_ident=None,
@@ -109,10 +123,14 @@ class SimPipeline():
count = 0
self.issue_db.db.row_factory = sqlite3.Row
cur = self.issue_db.db.cursor()
- for row in cur.execute('SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3'):
+ for row in cur.execute(
+ "SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3"
+ ):
# filter out "contents" and "index" items
# TODO: more filters; also redundant with IssueDB code?
- if row['issue_item'].endswith('_contents') or row['issue_item'].endswith('_index'):
+ if row["issue_item"].endswith("_contents") or row["issue_item"].endswith(
+ "_index"
+ ):
continue
try:
full_issue = self.fetch_sim_issue(row)
@@ -124,7 +142,7 @@ class SimPipeline():
continue
if not full_issue:
continue
- for leaf in full_issue['page_texts']:
+ for leaf in full_issue["page_texts"]:
bundle = IntermediateBundle(
doc_type=DocType.sim_page,
releases=[],
@@ -132,13 +150,13 @@ class SimPipeline():
grobid_fulltext=None,
pdftotext_fulltext=None,
sim_fulltext=dict(
- issue_item=full_issue['issue_item'],
- pages=str(leaf['page_num']),
+ issue_item=full_issue["issue_item"],
+ pages=str(leaf["page_num"]),
page_texts=[leaf],
release_ident=None,
- pub_item_metadata=full_issue['pub_item_metadata'],
- issue_item_metadata=full_issue['issue_item_metadata'],
- )
+ pub_item_metadata=full_issue["pub_item_metadata"],
+ issue_item_metadata=full_issue["issue_item_metadata"],
+ ),
)
print(bundle.json())
count += 1
@@ -147,6 +165,7 @@ class SimPipeline():
if limit is not None and count >= limit:
break
+
def main():
"""
Run this command like:
@@ -155,20 +174,20 @@ def main():
"""
parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
subparsers = parser.add_subparsers()
- parser.add_argument("--issue-db-file",
+ parser.add_argument(
+ "--issue-db-file",
help="sqlite3 database file to open",
- default='data/issue_db.sqlite',
- type=str)
+ default="data/issue_db.sqlite",
+ type=str,
+ )
- sub = subparsers.add_parser('run_issue_db',
- help="iterates through entire IssueDB")
- sub.set_defaults(func='run_issue_db')
- sub.add_argument("--limit",
- help="maximum number of pages to index",
- type=int)
+ sub = subparsers.add_parser("run_issue_db", help="iterates through entire IssueDB")
+ sub.set_defaults(func="run_issue_db")
+ sub.add_argument("--limit", help="maximum number of pages to index", type=int)
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -177,11 +196,12 @@ def main():
sp = SimPipeline(issue_db=IssueDB(args.issue_db_file))
- if args.func == 'run_issue_db':
+ if args.func == "run_issue_db":
sp.run_issue_db(limit=args.limit)
else:
func = getattr(sp, args.func)
func()
-if __name__=="__main__":
+
+if __name__ == "__main__":
main()
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 953ebff..b5a0223 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,4 +1,3 @@
-
import os
import io
import sys
@@ -10,83 +9,89 @@ import internetarchive
from fatcat_scholar.api_entities import *
from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
+from fatcat_scholar.sandcrawler import (
+ SandcrawlerPostgrestClient,
+ SandcrawlerMinioClient,
+)
from fatcat_scholar.issue_db import IssueDB, SimIssueRow
from fatcat_scholar.schema import *
from fatcat_scholar.grobid2json import teixml2json
def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]:
- if not sim['page_texts']:
+ if not sim["page_texts"]:
return None
- first_page = sim['page_texts'][0]['page_num']
- issue_item = sim['issue_item']
+ first_page = sim["page_texts"][0]["page_num"]
+ issue_item = sim["issue_item"]
return ScholarFulltext(
- lang_code=None, # TODO: pub/issue metadata? or langdetect?
- body="\n".join([p['raw_text'] for p in sim['page_texts']]),
- #acknowledgement=None,
- #annex=None,
- release_ident=sim.get('release_ident'),
- #file_ident=None,
- #file_sha1=None,
- #file_mimetype=None,
+ lang_code=None, # TODO: pub/issue metadata? or langdetect?
+ body="\n".join([p["raw_text"] for p in sim["page_texts"]]),
+ # acknowledgement=None,
+ # annex=None,
+ release_ident=sim.get("release_ident"),
+ # file_ident=None,
+ # file_sha1=None,
+ # file_mimetype=None,
thumbnail_url=f"https://archive.org/serve/{issue_item}/__ia_thumb.jpg",
access_url=f"https://archive.org/details/{issue_item}/page/{first_page}",
access_type=AccessType.ia_sim,
)
+
def es_sim_from_sim(sim: Dict[str, Any]) -> ScholarSim:
first_page = None
- if sim['page_texts']:
- first_page = sim['page_texts'][0]['page_num']
+ if sim["page_texts"]:
+ first_page = sim["page_texts"][0]["page_num"]
return ScholarSim(
- issue_item=sim['issue_item'],
- pub_collection=sim['pub_item_metadata']['metadata']['identifier'],
- sim_pubid=sim['issue_item_metadata']['metadata']['sim_pubid'],
+ issue_item=sim["issue_item"],
+ pub_collection=sim["pub_item_metadata"]["metadata"]["identifier"],
+ sim_pubid=sim["issue_item_metadata"]["metadata"]["sim_pubid"],
first_page=first_page,
)
+
SIM_RELEASE_TYPE_MAP = {
- 'Scholarly Journals': 'article-journal',
+ "Scholarly Journals": "article-journal",
# TODO:
}
SIM_LANG_MAP = {
- 'English': 'en',
+ "English": "en",
# TODO:
}
SIM_COUNTRY_MAP = {
- 'Netherlands': 'nl',
+ "Netherlands": "nl",
# TODO:
}
+
def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
- issue_meta = sim['issue_item_metadata']['metadata']
- pub_meta = sim['pub_item_metadata']['metadata']
+ issue_meta = sim["issue_item_metadata"]["metadata"]
+ pub_meta = sim["pub_item_metadata"]["metadata"]
first_page = None
- if sim['page_texts']:
- first_page = sim['page_texts'][0]['page_num']
- container_name = sim['pub_item_metadata']['metadata']['title']
+ if sim["page_texts"]:
+ first_page = sim["page_texts"][0]["page_num"]
+ container_name = sim["pub_item_metadata"]["metadata"]["title"]
last_word = container_name.split()[-1]
- if len(last_word) == 9 and last_word[4] == '-':
+ if len(last_word) == 9 and last_word[4] == "-":
container_name = container_name[:-10]
issns = []
- raw_issn = issue_meta.get('issn')
+ raw_issn = issue_meta.get("issn")
if raw_issn and len(raw_issn) == 9:
issns.append(raw_issn)
- volume = issue_meta.get('volume')
+ volume = issue_meta.get("volume")
volume_int = None
if volume and volume.isdigit():
volume_int = int(volume)
- issue = issue_meta.get('issue')
+ issue = issue_meta.get("issue")
issue_int = None
if issue and issue.isdigit():
issue_int = int(issue)
- date = issue_meta.get('date')
+ date = issue_meta.get("date")
release_year = None
if date and len(date) > 4 and date[:4].isdigit():
release_year = int(date[:4])
@@ -96,52 +101,52 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
release_date = date
return ScholarBiblio(
- #release_ident=release.ident,
+ # release_ident=release.ident,
title=None,
- #subtitle=None,
- #original_title=release.original_title,
+ # subtitle=None,
+ # original_title=release.original_title,
release_date=release_date,
release_year=release_year,
- release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get('pub_type')),
- release_stage="published", # as a default
- #withdrawn_status=release.withdrawn_status,
- lang_code=SIM_LANG_MAP.get(pub_meta.get('language')),
- country_code=SIM_COUNTRY_MAP.get(pub_meta.get('country')),
+ release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get("pub_type")),
+ release_stage="published", # as a default
+ # withdrawn_status=release.withdrawn_status,
+ lang_code=SIM_LANG_MAP.get(pub_meta.get("language")),
+ country_code=SIM_COUNTRY_MAP.get(pub_meta.get("country")),
volume=volume,
volume_int=volume_int,
issue=issue,
issue_int=issue_int,
- pages=sim.get('pages'),
+ pages=sim.get("pages"),
first_page=first_page,
first_page_int=None,
- #number=None,
-
+ # number=None,
# no external identifiers
-
- #license_slug=release.license_slug,
- publisher=issue_meta.get('publisher'),
+ # license_slug=release.license_slug,
+ publisher=issue_meta.get("publisher"),
container_name=container_name,
- container_original_name=None, # TODO pass-through
- container_ident=None, # TODO: pass-through
- container_type=None, # TODO
- container_issnl=None, # TODO: pass-through
+ container_original_name=None, # TODO pass-through
+ container_ident=None, # TODO: pass-through
+ container_type=None, # TODO
+ container_issnl=None, # TODO: pass-through
issns=issns,
-
# no contrib/affiliation info
contrib_names=[],
affiliations=[],
)
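The last_word check near the top of this function strips what is typically a trailing year-range token (nine characters with a dash in the middle) from SIM publication titles, and volume/issue strings are only converted to integers when purely numeric. A short sketch with made-up values:

    # sketch of the title trim and volume parsing above (illustrative values only)
    container_name = "Quarterly Review of Something 1954-1959"
    last_word = container_name.split()[-1]
    if len(last_word) == 9 and last_word[4] == "-":
        container_name = container_name[:-10]  # -> "Quarterly Review of Something"

    volume = "12A"
    volume_int = int(volume) if volume and volume.isdigit() else None  # -> None, not an error
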
-def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity) -> ScholarFulltext:
+
+def _add_file_release_meta(
+ fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity
+) -> ScholarFulltext:
best_url = None
best_url_type = None
for url in fe.urls:
best_url = url.url
best_url_type = AccessType.web
- if '//archive.org/' in url.url:
+ if "//archive.org/" in url.url:
best_url_type = AccessType.ia_file
break
- elif '//web.archive.org/' in url.url:
+ elif "//web.archive.org/" in url.url:
best_url_type = AccessType.wayback
break
if url.rel == "repository":
@@ -157,30 +162,36 @@ def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: Fil
return fulltext
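The URL loop above prefers direct archive.org files, then wayback captures, over generic web URLs when picking the best access link. A rough sketch of that ordering (the classify_url helper is hypothetical, standing in for the inline logic):

    # sketch: how a single URL maps to an AccessType in the loop above
    def classify_url(url: str) -> str:
        if "//archive.org/" in url:
            return "ia_file"
        elif "//web.archive.org/" in url:
            return "wayback"
        return "web"

    assert classify_url("https://archive.org/download/foo/foo.pdf") == "ia_file"
    assert classify_url("https://web.archive.org/web/2020/https://example.com/a.pdf") == "wayback"
    assert classify_url("https://example.com/a.pdf") == "web"
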
-def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
+def es_fulltext_from_grobid(
+ tei_xml: str, re: ReleaseEntity, fe: FileEntity
+) -> Optional[ScholarFulltext]:
obj = teixml2json(tei_xml)
- if not obj.get('body'):
+ if not obj.get("body"):
return None
ret = ScholarFulltext(
- lang_code=obj.get('lang'),
- body=obj.get('body'),
- acknowledgement=obj.get('acknowledgement'),
- annex=obj.get('annex'),
- thumbnail_url=None, # TODO: sandcrawler thumbnails
+ lang_code=obj.get("lang"),
+ body=obj.get("body"),
+ acknowledgement=obj.get("acknowledgement"),
+ annex=obj.get("annex"),
+ thumbnail_url=None, # TODO: sandcrawler thumbnails
)
return _add_file_release_meta(ret, re, fe)
-def es_fulltext_from_pdftotext(pdftotext: Any, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]:
+
+def es_fulltext_from_pdftotext(
+ pdftotext: Any, re: ReleaseEntity, fe: FileEntity
+) -> Optional[ScholarFulltext]:
ret = ScholarFulltext(
lang_code=re.language,
- body=pdftotext['raw_text'],
+ body=pdftotext["raw_text"],
acknowledgement=None,
annex=None,
- thumbnail_url=None, # TODO: sandcrawler thumbnails
+ thumbnail_url=None, # TODO: sandcrawler thumbnails
)
return _add_file_release_meta(ret, re, fe)
+
def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
tags: List[str] = []
@@ -203,7 +214,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
work_ident = heavy.releases[0].work_id
key = f"work_{work_ident}"
assert heavy.biblio_release_ident
- primary_release = [r for r in heavy.releases if r.ident == heavy.biblio_release_ident][0]
+ primary_release = [
+ r for r in heavy.releases if r.ident == heavy.biblio_release_ident
+ ][0]
biblio = es_biblio_from_release(primary_release)
# TODO: abstracts from releases also; abstracts_dict; abstracts from GROBID parse
@@ -212,19 +225,44 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
raise NotImplementedError(f"doc_type: {heavy.doc_type}")
if heavy.grobid_fulltext:
- fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0]
- fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0]
- fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)
+ fulltext_release = [
+ r
+ for r in heavy.releases
+ if r.ident == heavy.grobid_fulltext["release_ident"]
+ ][0]
+ fulltext_file = [
+ f
+ for f in fulltext_release.files
+ if f.ident == heavy.grobid_fulltext["file_ident"]
+ ][0]
+ fulltext = es_fulltext_from_grobid(
+ heavy.grobid_fulltext["tei_xml"], fulltext_release, fulltext_file
+ )
# hack to pull through thumbnail from local pdftotext
- if fulltext and fulltext.file_sha1 and not fulltext.thumbnail_url and heavy.pdftotext_fulltext:
+ if (
+ fulltext
+ and fulltext.file_sha1
+ and not fulltext.thumbnail_url
+ and heavy.pdftotext_fulltext
+ ):
# https://covid19.fatcat.wiki/fulltext_web/thumbnail/c9/c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521.png
fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png"
if not fulltext and heavy.pdftotext_fulltext:
- fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0]
- fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0]
- fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file)
+ fulltext_release = [
+ r
+ for r in heavy.releases
+ if r.ident == heavy.pdftotext_fulltext["release_ident"]
+ ][0]
+ fulltext_file = [
+ f
+ for f in fulltext_release.files
+ if f.ident == heavy.pdftotext_fulltext["file_ident"]
+ ][0]
+ fulltext = es_fulltext_from_pdftotext(
+ heavy.pdftotext_fulltext, fulltext_release, fulltext_file
+ )
# TODO: additional access list
access_dict = dict()
@@ -246,41 +284,41 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
# tags
if biblio.license_slug and biblio.license_slug.lower().startswith("cc-"):
- tags.append('oa')
+ tags.append("oa")
if primary_release and primary_release.container:
container = primary_release.container
if container.extra:
- if container.extra.get('doaj'):
- tags.append('doaj')
- tags.append('oa')
- if container.extra.get('road'):
- tags.append('road')
- tags.append('oa')
- if container.extra.get('szczepanski'):
- tags.append('szczepanski')
- tags.append('oa')
- if container.extra.get('ia', {}).get('longtail_oa'):
- tags.append('longtail')
- tags.append('oa')
- if container.extra.get('sherpa_romeo', {}).get('color') == 'white':
- tags.append('oa')
- if container.extra.get('default_license', '').lower().startswith('cc-'):
- tags.append('oa')
- if container.extra.get('platform'):
+ if container.extra.get("doaj"):
+ tags.append("doaj")
+ tags.append("oa")
+ if container.extra.get("road"):
+ tags.append("road")
+ tags.append("oa")
+ if container.extra.get("szczepanski"):
+ tags.append("szczepanski")
+ tags.append("oa")
+ if container.extra.get("ia", {}).get("longtail_oa"):
+ tags.append("longtail")
+ tags.append("oa")
+ if container.extra.get("sherpa_romeo", {}).get("color") == "white":
+ tags.append("oa")
+ if container.extra.get("default_license", "").lower().startswith("cc-"):
+ tags.append("oa")
+ if container.extra.get("platform"):
# scielo, ojs, wordpress, etc
- tags.append(container.extra['platform'].lower())
- if biblio.doi_prefix == '10.2307':
- tags.append('jstor')
+ tags.append(container.extra["platform"].lower())
+ if biblio.doi_prefix == "10.2307":
+ tags.append("jstor")
# biorxiv/medrxiv hacks
if not biblio.container_name and biblio.release_stage != "published":
for _, acc in access_dict.items():
if "://www.medrxiv.org/" in acc.access_url:
- biblio.container_name = 'medRxiv'
+ biblio.container_name = "medRxiv"
if biblio.release_stage == None:
biblio.release_stage = "submitted"
elif "://www.biorxiv.org/" in acc.access_url:
- biblio.container_name = 'bioRxiv'
+ biblio.container_name = "bioRxiv"
if biblio.release_stage == None:
biblio.release_stage = "submitted"
tags = list(set(tags))
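Most of these tags are driven by the container's extra metadata. The same logic can be previewed against a hypothetical extra dict (field names as used above; the values are invented):

    # sketch: tag derivation for a made-up container "extra" dict
    extra = {"doaj": {"as_of": "2020-05-01"}, "ia": {"longtail_oa": True}, "platform": "OJS"}
    tags = []
    if extra.get("doaj"):
        tags += ["doaj", "oa"]
    if extra.get("ia", {}).get("longtail_oa"):
        tags += ["longtail", "oa"]
    if extra.get("platform"):
        tags.append(extra["platform"].lower())
    sorted(set(tags))  # -> ['doaj', 'longtail', 'oa', 'ojs']
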
@@ -291,7 +329,6 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
doc_index_ts=datetime.datetime.utcnow(),
work_ident=work_ident,
tags=tags,
-
biblio=biblio,
fulltext=fulltext,
ia_sim=ia_sim,
@@ -300,23 +337,28 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
access=list(access_dict.values()),
)
+
def run_transform(infile):
for line in infile:
obj = json.loads(line)
heavy = IntermediateBundle(
- doc_type=DocType(obj['doc_type']),
- releases=[entity_from_json(json.dumps(re), ReleaseEntity) for re in obj['releases']],
- biblio_release_ident=obj.get('biblio_release_ident'),
- grobid_fulltext=obj.get('grobid_fulltext'),
- pdftotext_fulltext=obj.get('pdftotext_fulltext'),
- sim_fulltext=obj.get('sim_fulltext'),
+ doc_type=DocType(obj["doc_type"]),
+ releases=[
+ entity_from_json(json.dumps(re), ReleaseEntity)
+ for re in obj["releases"]
+ ],
+ biblio_release_ident=obj.get("biblio_release_ident"),
+ grobid_fulltext=obj.get("grobid_fulltext"),
+ pdftotext_fulltext=obj.get("pdftotext_fulltext"),
+ sim_fulltext=obj.get("sim_fulltext"),
)
es_doc = transform_heavy(heavy)
if not es_doc:
continue
print(es_doc.json())
+
def main():
"""
Run this command like:
@@ -325,25 +367,32 @@ def main():
"""
parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
subparsers = parser.add_subparsers()
- sub = subparsers.add_parser('run_transform',
- help="iterates through 'heavy' intermediate")
- sub.set_defaults(func='run_transform')
- sub.add_argument("json_file",
+ sub = subparsers.add_parser(
+ "run_transform", help="iterates through 'heavy' intermediate"
+ )
+ sub.set_defaults(func="run_transform")
+ sub.add_argument(
+ "json_file",
help="intermediate globs as JSON-lines",
- nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+ nargs="?",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do! (try --help)")
sys.exit(-1)
- if args.func == 'run_transform':
+ if args.func == "run_transform":
run_transform(infile=args.json_file)
else:
raise NotImplementedError(args.func)
-if __name__=="__main__":
+
+if __name__ == "__main__":
main()
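The reworked argument parser above keeps stdin as the default input for run_transform, so the transform can also be driven directly from Python. A minimal sketch (the module path fatcat_scholar.transform and the input filename are assumptions):

    # sketch: run the 'heavy' intermediate transform over a JSON-lines file
    from fatcat_scholar.transform import run_transform  # assumed module path

    with open("work_intermediate.json", "r") as infile:  # hypothetical input file
        run_transform(infile)  # writes one JSON document per input line that yields a ScholarDoc
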
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 2fd8b24..6c8a2e9 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -19,7 +19,9 @@ from fatcat_scholar.search import do_fulltext_search, FulltextQuery, FulltextHit
print(f"dynaconf settings: {settings.as_dict()}", file=sys.stderr)
I18N_LANG_TRANSLATIONS = ["de", "zh"]
-I18N_LANG_OPTIONS = I18N_LANG_TRANSLATIONS + [settings.I18N_LANG_DEFAULT,]
+I18N_LANG_OPTIONS = I18N_LANG_TRANSLATIONS + [
+ settings.I18N_LANG_DEFAULT,
+]
class LangPrefix:
@@ -32,14 +34,15 @@ class LangPrefix:
"""
def __init__(self, request: Request):
- self.prefix : str = ""
- self.code : str = settings.I18N_LANG_DEFAULT
+ self.prefix: str = ""
+ self.code: str = settings.I18N_LANG_DEFAULT
for lang_option in I18N_LANG_OPTIONS:
if request.url.path.startswith(f"/{lang_option}/"):
self.prefix = f"/{lang_option}"
self.code = lang_option
break
+
class ContentNegotiation:
"""
    Chooses a mimetype to return based on Accept header.
@@ -49,31 +52,40 @@ class ContentNegotiation:
def __init__(self, request: Request):
self.mimetype = "text/html"
- if request.headers.get('accept', '').startswith('application/json'):
+ if request.headers.get("accept", "").startswith("application/json"):
self.mimetype = "application/json"
+
api = APIRouter()
+
@api.get("/", operation_id="get_home")
async def home():
return {"endpoints": {"/": "this", "/search": "fulltext search"}}
+
@api.get("/search", operation_id="get_search")
async def search(query: FulltextQuery = Depends(FulltextQuery)):
return {"message": "search results would go here, I guess"}
+
web = APIRouter()
+
def locale_gettext(translations):
def gt(s):
return translations.ugettext(s)
+
return gt
+
def locale_ngettext(translations):
def ngt(s, n):
        return translations.ungettext(s, s, n)
+
return ngt
+
def load_i18n_templates():
"""
This is a hack to work around lack of per-request translation
@@ -90,53 +102,68 @@ def load_i18n_templates():
d = dict()
for lang_opt in I18N_LANG_OPTIONS:
translations = babel.support.Translations.load(
- dirname="fatcat_scholar/translations",
- locales=[lang_opt],
+ dirname="fatcat_scholar/translations", locales=[lang_opt],
)
templates = Jinja2Templates(
- directory="fatcat_scholar/templates",
- extensions=["jinja2.ext.i18n"],
+ directory="fatcat_scholar/templates", extensions=["jinja2.ext.i18n"],
)
templates.env.install_gettext_translations(translations, newstyle=True)
templates.env.install_gettext_callables(
- locale_gettext(translations),
- locale_ngettext(translations),
- newstyle=True,
+ locale_gettext(translations), locale_ngettext(translations), newstyle=True,
)
# remove a lot of whitespace in HTML output with these configs
templates.env.trim_blocks = True
        templates.env.lstrip_blocks = True
# pass-through application settings to be available in templates
- templates.env.globals['settings'] = settings
+ templates.env.globals["settings"] = settings
d[lang_opt] = templates
return d
+
i18n_templates = load_i18n_templates()
@web.get("/", include_in_schema=False)
-async def web_home(request: Request, lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)):
+async def web_home(
+ request: Request,
+ lang: LangPrefix = Depends(LangPrefix),
+ content: ContentNegotiation = Depends(ContentNegotiation),
+):
if content.mimetype == "application/json":
return await home()
- return i18n_templates[lang.code].TemplateResponse("home.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix})
+ return i18n_templates[lang.code].TemplateResponse(
+ "home.html",
+ {"request": request, "locale": lang.code, "lang_prefix": lang.prefix},
+ )
@web.get("/about", include_in_schema=False)
async def web_about(request: Request, lang: LangPrefix = Depends(LangPrefix)):
- return i18n_templates[lang.code].TemplateResponse("about.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix})
+ return i18n_templates[lang.code].TemplateResponse(
+ "about.html",
+ {"request": request, "locale": lang.code, "lang_prefix": lang.prefix},
+ )
@web.get("/help", include_in_schema=False)
async def web_help(request: Request, lang: LangPrefix = Depends(LangPrefix)):
- return i18n_templates[lang.code].TemplateResponse("help.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix})
+ return i18n_templates[lang.code].TemplateResponse(
+ "help.html",
+ {"request": request, "locale": lang.code, "lang_prefix": lang.prefix},
+ )
@web.get("/search", include_in_schema=False)
-async def web_search(request: Request, query: FulltextQuery = Depends(FulltextQuery), lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)):
+async def web_search(
+ request: Request,
+ query: FulltextQuery = Depends(FulltextQuery),
+ lang: LangPrefix = Depends(LangPrefix),
+ content: ContentNegotiation = Depends(ContentNegotiation),
+):
if content.mimetype == "application/json":
return await search(query)
- hits : Optional[FulltextHits] = None
+ hits: Optional[FulltextHits] = None
search_error: Optional[dict] = None
status_code: int = 200
if query.q is not None:
@@ -182,4 +209,3 @@ for lang_option in I18N_LANG_OPTIONS:
app.include_router(api)
app.mount("/static", StaticFiles(directory="fatcat_scholar/static"), name="static")
-
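Aside from formatting, the interesting behavior in this module is the pair of request-scoped dependencies: LangPrefix picks a language from a URL path prefix, and ContentNegotiation switches to the JSON handlers when the client asks for application/json. A sketch using FastAPI's test client (this assumes the module exposes app and that the localized routers are mounted under each language prefix, as the loop at the end of the module suggests):

    # sketch: language prefixes and content negotiation end-to-end
    from fastapi.testclient import TestClient
    from fatcat_scholar.web import app  # assumed import path

    client = TestClient(app)
    # Accept: application/json on "/" short-circuits to the JSON home() payload
    resp = client.get("/", headers={"accept": "application/json"})
    assert resp.json()["endpoints"]["/"] == "this"
    # "/de/" makes LangPrefix pick code="de", so the German templates render
    assert client.get("/de/").status_code == 200
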
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 46e40e1..af558a3 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -1,4 +1,3 @@
-
import os
import io
import sys
@@ -12,9 +11,17 @@ import internetarchive
from fatcat_scholar.api_entities import *
from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
+from fatcat_scholar.sandcrawler import (
+ SandcrawlerPostgrestClient,
+ SandcrawlerMinioClient,
+)
from fatcat_scholar.issue_db import IssueDB, SimIssueRow, SimPubRow
-from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
+from fatcat_scholar.schema import (
+ es_biblio_from_release,
+ es_release_from_release,
+ DocType,
+ IntermediateBundle,
+)
from fatcat_scholar.sim_pipeline import truncate_pub_meta, truncate_issue_meta
@@ -25,17 +32,18 @@ def parse_pages(raw: str) -> Tuple[Optional[int], Optional[int]]:
first = int(first_raw)
if not "-" in raw:
return (first, first)
- last_raw = raw.split('-')[-1]
+ last_raw = raw.split("-")[-1]
if not last_raw.isdigit():
return (first, first)
last = int(last_raw)
if last < first:
- last_munge = first_raw[0:(len(first_raw)-len(last_raw))] + last_raw
+ last_munge = first_raw[0 : (len(first_raw) - len(last_raw))] + last_raw
last = int(last_munge)
if last < first:
return (first, first)
return (first, last)
+
def test_parse_pages():
assert parse_pages("479-89") == (479, 489)
assert parse_pages("466-7") == (466, 467)
@@ -52,24 +60,33 @@ def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]:
Returns a list of release idents in preference order (best first) to
try and find fulltext for.
"""
- releases_sorted = sorted(releases, reverse=True, key=lambda r: (
- r.release_stage=="updated",
- r.release_stage=="published",
- r.volume is not None,
- r.container_id is not None,
- r.ext_ids.pmid is not None,
- r.release_stage=="submitted",
- r.release_type is not None,
- r.release_year,
- r.release_date,
- r.version,
- ))
+ releases_sorted = sorted(
+ releases,
+ reverse=True,
+ key=lambda r: (
+ r.release_stage == "updated",
+ r.release_stage == "published",
+ r.volume is not None,
+ r.container_id is not None,
+ r.ext_ids.pmid is not None,
+ r.release_stage == "submitted",
+ r.release_type is not None,
+ r.release_year,
+ r.release_date,
+ r.version,
+ ),
+ )
return [r.ident for r in releases_sorted]
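Because True sorts above False, a published release attached to a container outranks a bare submitted preprint even if the preprint is newer; the year/date/version fields only break ties. A small sketch with stand-in objects (not real ReleaseEntity instances):

    # sketch: ordering produced by the sort key above (SimpleNamespace stand-ins)
    from types import SimpleNamespace as NS

    published = NS(ident="aaaa", release_stage="published", volume="12", container_id="c1",
                   ext_ids=NS(pmid=None), release_type="article-journal",
                   release_year=2018, release_date=None, version=None)
    preprint = NS(ident="bbbb", release_stage="submitted", volume=None, container_id=None,
                  ext_ids=NS(pmid=None), release_type="article-journal",
                  release_year=2020, release_date=None, version=None)

    assert fulltext_pref_list([preprint, published]) == ["aaaa", "bbbb"]
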
-class WorkPipeline():
-
- def __init__(self, issue_db: IssueDB, sandcrawler_db_client: SandcrawlerPostgrestClient, sandcrawler_s3_client: SandcrawlerMinioClient, fulltext_cache_dir=None):
+class WorkPipeline:
+ def __init__(
+ self,
+ issue_db: IssueDB,
+ sandcrawler_db_client: SandcrawlerPostgrestClient,
+ sandcrawler_s3_client: SandcrawlerMinioClient,
+ fulltext_cache_dir=None,
+ ):
self.issue_db: IssueDB = issue_db
self.ia_client = internetarchive.get_session()
self.sandcrawler_db_client = sandcrawler_db_client
@@ -87,9 +104,9 @@ class WorkPipeline():
if not fe.urls:
return None
grobid_meta = self.sandcrawler_db_client.get_grobid(fe.sha1)
- if not grobid_meta or grobid_meta['status'] != 'success':
+ if not grobid_meta or grobid_meta["status"] != "success":
return None
- #print(grobid_meta)
+ # print(grobid_meta)
try:
grobid_xml = self.sandcrawler_s3_client.get_blob(
folder="grobid",
@@ -98,13 +115,11 @@ class WorkPipeline():
prefix="",
bucket="sandcrawler",
)
- #print(grobid_xml)
+ # print(grobid_xml)
except minio.error.NoSuchKey:
return None
return dict(
- tei_xml=grobid_xml,
- release_ident=release_ident,
- file_ident=fe.ident,
+ tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident,
)
def fetch_file_pdftotext(self, fe: FileEntity, release_ident: str) -> Optional[Any]:
@@ -115,14 +130,14 @@ class WorkPipeline():
"""
# HACK: look for local pdftotext output
if self.fulltext_cache_dir:
- local_txt_path = f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt"
+ local_txt_path = (
+ f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt"
+ )
try:
- with open(local_txt_path, 'r') as txt_file:
+ with open(local_txt_path, "r") as txt_file:
raw_text = txt_file.read()
return dict(
- raw_text=raw_text,
- release_ident=release_ident,
- file_ident=fe.ident,
+ raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
)
except FileNotFoundError:
pass
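The local cache is sharded by the first two hex characters of the file SHA-1, mirroring the thumbnail layout used elsewhere. For a concrete hash (the cache directory here is a made-up example) the lookup path is:

    # sketch: pdftotext cache path for one file (hypothetical cache dir)
    fulltext_cache_dir = "/srv/fatcat_scholar/fulltext_web"
    sha1 = "c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521"
    local_txt_path = f"{fulltext_cache_dir}/pdftotext/{sha1[:2]}/{sha1}.txt"
    # -> .../pdftotext/c9/c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521.txt
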
@@ -144,9 +159,17 @@ class WorkPipeline():
if not sim_pubid:
return None
- return self.issue_db.lookup_issue(sim_pubid=sim_pubid, volume=release.volume, issue=release.issue)
+ return self.issue_db.lookup_issue(
+ sim_pubid=sim_pubid, volume=release.volume, issue=release.issue
+ )
- def fetch_sim(self, issue_db_row: SimIssueRow, issue_db_pub_row: SimPubRow, pages: str, release_ident: str) -> Optional[Any]:
+ def fetch_sim(
+ self,
+ issue_db_row: SimIssueRow,
+ issue_db_pub_row: SimPubRow,
+ pages: str,
+ release_ident: str,
+ ) -> Optional[Any]:
"""
issue_item
pages: str
@@ -169,17 +192,17 @@ class WorkPipeline():
leaf_index = dict()
leaf_list = []
- if not 'page_numbers' in issue_meta:
+ if not "page_numbers" in issue_meta:
# TODO: warn
return None
- for entry in issue_meta['page_numbers'].get('pages', []):
- page_num = entry['pageNumber']
- leaf_index[entry['leafNum']] = page_num
+ for entry in issue_meta["page_numbers"].get("pages", []):
+ page_num = entry["pageNumber"]
+ leaf_index[entry["leafNum"]] = page_num
if not (page_num and page_num.isdigit()):
continue
page_num = int(page_num)
if page_num >= first_page and page_num <= last_page:
- leaf_list.append(entry['leafNum'])
+ leaf_list.append(entry["leafNum"])
if not leaf_list:
return None
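The issue's page_numbers metadata relates physical leaves to printed page numbers; the loop above inverts that into leaf_index and keeps only the leaves whose printed page falls inside the requested range. A sketch with a made-up metadata fragment (field names match the code, values are invented):

    # sketch: selecting leaves for printed pages 12-13
    issue_meta = {"page_numbers": {"pages": [
        {"leafNum": 5, "pageNumber": "11"},
        {"leafNum": 6, "pageNumber": "12"},
        {"leafNum": 7, "pageNumber": "13"},
        {"leafNum": 8, "pageNumber": ""},   # unnumbered plate: indexed, but not selected
    ]}}
    first_page, last_page = 12, 13
    leaf_index, leaf_list = {}, []
    for entry in issue_meta["page_numbers"]["pages"]:
        page_num = entry["pageNumber"]
        leaf_index[entry["leafNum"]] = page_num
        if page_num.isdigit() and first_page <= int(page_num) <= last_page:
            leaf_list.append(entry["leafNum"])
    # leaf_index -> {5: '11', 6: '12', 7: '13', 8: ''}; leaf_list -> [6, 7]
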
@@ -190,16 +213,22 @@ class WorkPipeline():
# override 'close()' method so we can still read out contents
djvu_bytes = io.BytesIO()
- djvu_bytes.close = lambda: None # type: ignore
+ djvu_bytes.close = lambda: None # type: ignore
assert issue_item_djvu.download(fileobj=djvu_bytes) == True
djvu_bytes.seek(0)
djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8"))
- del(djvu_bytes)
+ del djvu_bytes
leaf_dict = djvu_extract_leaf_texts(djvu_xml, only_leaves=leaf_list)
for leaf_num, raw_text in leaf_dict.items():
- page_texts.append(dict(page_num=leaf_index.get(leaf_num), leaf_num=leaf_num, raw_text=raw_text))
+ page_texts.append(
+ dict(
+ page_num=leaf_index.get(leaf_num),
+ leaf_num=leaf_num,
+ raw_text=raw_text,
+ )
+ )
return dict(
issue_item=issue_db_row.issue_item,
@@ -220,7 +249,7 @@ class WorkPipeline():
pref_idents = fulltext_pref_list(releases)
release_dict = dict([(r.ident, r) for r in releases])
- #print(f"pref_idents={pref_idents}", file=sys.stderr)
+ # print(f"pref_idents={pref_idents}", file=sys.stderr)
# find best accessible fatcat file
grobid_fulltext: Optional[Any] = None
@@ -244,12 +273,12 @@ class WorkPipeline():
sim_issue: Optional[Any] = None
for ident in pref_idents:
release = release_dict[ident]
- #print(f"{release.extra}\n{release.pages}", file=sys.stderr)
+ # print(f"{release.extra}\n{release.pages}", file=sys.stderr)
if not release.pages:
continue
# TODO: in the future, will use release.extra.ia.sim.sim_pubid for lookup
sim_issue = self.lookup_sim(release)
- #print(f"release_{release.ident}: sim_issue={sim_issue}", file=sys.stderr)
+ # print(f"release_{release.ident}: sim_issue={sim_issue}", file=sys.stderr)
if not sim_issue:
continue
sim_pub = self.issue_db.lookup_pub(sim_issue.sim_pubid)
@@ -257,7 +286,9 @@ class WorkPipeline():
continue
# XXX: control flow tweak?
try:
- sim_fulltext = self.fetch_sim(sim_issue, sim_pub, release.pages, release.ident)
+ sim_fulltext = self.fetch_sim(
+ sim_issue, sim_pub, release.pages, release.ident
+ )
except requests.exceptions.ConnectionError as e:
print(str(e), file=sys.stderr)
continue
@@ -300,13 +331,16 @@ class WorkPipeline():
ib = self.process_release_list(batch)
print(ib.json())
batch_work_id = None
- batch = [release,]
+ batch = [
+ release,
+ ]
batch_work_id = release.work_id
if batch:
ib = self.process_release_list(batch)
print(ib.json())
+
def main():
"""
Run this command like:
@@ -315,31 +349,46 @@ def main():
"""
parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
subparsers = parser.add_subparsers()
- parser.add_argument("--issue-db-file",
+ parser.add_argument(
+ "--issue-db-file",
help="sqlite3 database file to open",
- default='data/issue_db.sqlite',
- type=str)
- parser.add_argument("--sandcrawler-db-api",
+ default="data/issue_db.sqlite",
+ type=str,
+ )
+ parser.add_argument(
+ "--sandcrawler-db-api",
help="Sandcrawler Postgrest API endpoint",
- default='http://aitio.us.archive.org:3030',
- type=str)
- parser.add_argument("--sandcrawler-s3-api",
+ default="http://aitio.us.archive.org:3030",
+ type=str,
+ )
+ parser.add_argument(
+ "--sandcrawler-s3-api",
help="Sandcrawler S3 (minio/seaweedfs) API endpoint",
- default='aitio.us.archive.org:9000',
- type=str)
+ default="aitio.us.archive.org:9000",
+ type=str,
+ )
- sub = subparsers.add_parser('run_releases',
- help="takes expanded release entity JSON, sorted by work_ident")
- sub.set_defaults(func='run_releases')
- sub.add_argument("json_file",
+ sub = subparsers.add_parser(
+ "run_releases", help="takes expanded release entity JSON, sorted by work_ident"
+ )
+ sub.set_defaults(func="run_releases")
+ sub.add_argument(
+ "json_file",
help="release entities, as JSON-lines",
- nargs='?', default=sys.stdin, type=argparse.FileType('r'))
- sub.add_argument("--fulltext-cache-dir",
+ nargs="?",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub.add_argument(
+ "--fulltext-cache-dir",
help="path of local directory with pdftotext fulltext (and thumbnails)",
- default=None, type=str)
+ default=None,
+ type=str,
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -348,20 +397,23 @@ def main():
wp = WorkPipeline(
issue_db=IssueDB(args.issue_db_file),
- sandcrawler_db_client=SandcrawlerPostgrestClient(api_url=args.sandcrawler_db_api),
+ sandcrawler_db_client=SandcrawlerPostgrestClient(
+ api_url=args.sandcrawler_db_api
+ ),
sandcrawler_s3_client=SandcrawlerMinioClient(
host_url=args.sandcrawler_s3_api,
- access_key=os.environ.get('MINIO_ACCESS_KEY'),
- secret_key=os.environ.get('MINIO_SECRET_KEY'),
+ access_key=os.environ.get("MINIO_ACCESS_KEY"),
+ secret_key=os.environ.get("MINIO_SECRET_KEY"),
),
fulltext_cache_dir=args.fulltext_cache_dir,
)
- if args.func == 'run_releases':
+ if args.func == "run_releases":
wp.run_releases(args.json_file)
else:
func = getattr(wp, args.func)
func()
-if __name__=="__main__":
+
+if __name__ == "__main__":
main()
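
Taken together, the reworked main() above is just argument plumbing around the same pipeline construction. The equivalent direct usage from Python (defaults copied from the parser above, credentials from the environment, input on stdin) would look roughly like:

    # sketch: constructing and running the pipeline directly, mirroring main() above
    import os
    import sys

    from fatcat_scholar.issue_db import IssueDB
    from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
    from fatcat_scholar.work_pipeline import WorkPipeline

    wp = WorkPipeline(
        issue_db=IssueDB("data/issue_db.sqlite"),
        sandcrawler_db_client=SandcrawlerPostgrestClient(api_url="http://aitio.us.archive.org:3030"),
        sandcrawler_s3_client=SandcrawlerMinioClient(
            host_url="aitio.us.archive.org:9000",
            access_key=os.environ.get("MINIO_ACCESS_KEY"),
            secret_key=os.environ.get("MINIO_SECRET_KEY"),
        ),
        fulltext_cache_dir=None,
    )
    wp.run_releases(sys.stdin)  # expects expanded release entity JSON-lines, sorted by work_ident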