1 files changed, 64 insertions, 38 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 10742fb..110991d 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -1,4 +1,3 @@
-
 """
 Originally wrote these as dataclasses using pydantic.dataclasses, but we don't
 get serialization for free with those. This is useful for things like
@@ -22,6 +21,7 @@ class DocType(str, Enum):
     work = "work"
     sim_page = "sim_page"
 
+
 class IntermediateBundle(BaseModel):
     doc_type: DocType
     releases: List[ReleaseEntity]
@@ -47,6 +47,7 @@ class AccessType(str, Enum):
     loginwall = "loginwall"
     shadow = "shadow"
 
+
 class ScholarBiblio(BaseModel):
     release_ident: Optional[str]
     title: Optional[str]
@@ -60,12 +61,12 @@ class ScholarBiblio(BaseModel):
     lang_code: Optional[str]
     country_code: Optional[str]
     volume: Optional[str]
-    volume_int: Optional[str]   # TODO: needed?
+    volume_int: Optional[str]  # TODO: needed?
     issue: Optional[str]
-    issue_int: Optional[str]    # TODO: needed?
+    issue_int: Optional[str]  # TODO: needed?
     pages: Optional[str]
     first_page: Optional[str]
-    first_page_int: Optional[str] # TODO: needed?
+    first_page_int: Optional[str]  # TODO: needed?
     number: Optional[str]
 
     doi: Optional[str]
@@ -93,6 +94,7 @@ class ScholarBiblio(BaseModel):
     contrib_names: List[str]
     affiliations: List[str]
 
+
 class ScholarFulltext(BaseModel):
     lang_code: Optional[str]
     body: str
@@ -106,6 +108,7 @@ class ScholarFulltext(BaseModel):
     access_url: Optional[str]
     access_type: Optional[AccessType]
 
+
 class ScholarRelease(BaseModel):
     ident: Optional[str]
     revision: Optional[str]
@@ -133,16 +136,19 @@ class ScholarRelease(BaseModel):
     container_issnl: Optional[str]
     container_type: Optional[str]
 
+
 class ScholarSim(BaseModel):
     issue_item: str
     pub_collection: str
     sim_pubid: str
     first_page: Optional[str]
 
+
 class ScholarAbstract(BaseModel):
     body: str
     lang_code: Optional[str]
 
+
 class ScholarAccess(BaseModel):
     access_type: AccessType
     access_url: str
@@ -150,9 +156,10 @@ class ScholarAccess(BaseModel):
     file_ident: Optional[str]
     release_ident: Optional[str]
 
+
 class ScholarDoc(BaseModel):
     key: str
-    doc_type: str # enum: work or page
+    doc_type: str  # enum: work or page
     doc_index_ts: datetime.datetime
     work_ident: Optional[str]
     tags: List[str] = []
@@ -164,29 +171,33 @@ class ScholarDoc(BaseModel):
     releases: List[ScholarRelease]
     access: List[ScholarAccess]
 
+
 def doi_split_prefix(doi: str) -> str:
-    return doi.split('/')[0]
+    return doi.split("/")[0]
+
 
 def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
     if not release.ext_ids.doi or not release.extra:
         return None
-    for registrar in ('crossref', 'datacite', 'jalc'):
+    for registrar in ("crossref", "datacite", "jalc"):
         if registrar in release.extra:
             return registrar
     # TODO: should we default to Crossref?
     return None
 
+
 UNWANTED_ABSTRACT_PREFIXES = [
     # roughly sort this long to short
-    'Abstract No Abstract ',
-    'Publisher Summary ',
-    'Abstract ',
-    'ABSTRACT ',
-    'Summary ',
-    'Background: ',
-    'Background ',
+    "Abstract No Abstract ",
+    "Publisher Summary ",
+    "Abstract ",
+    "ABSTRACT ",
+    "Summary ",
+    "Background: ",
+    "Background ",
 ]
 
+
 def scrub_text(raw: str, mimetype: str = None) -> str:
     """
     This function takes a mimetype-hinted string and tries to reduce it to a
@@ -201,25 +212,26 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
     text = ftfy.fix_text(raw)
 
     # remove HTML
-    text = BeautifulSoup(text, 'html.parser').get_text()
+    text = BeautifulSoup(text, "html.parser").get_text()
 
     # TODO: for performance, compile these as globals?
     # Three regexes below adapted from Blendle cleaner.py
     # https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29
-    text = re.sub(r'…', '...', text)
-    text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text)
-    text = re.sub(r'[„“]|(\'\')|(,,)', '"', text)
-    text = re.sub(r'\s+', ' ', text).strip()
+    text = re.sub(r"…", "...", text)
+    text = re.sub(r"[`‘’‛⸂⸃⸌⸍⸜⸝]", "'", text)
+    text = re.sub(r"[„“]|(\'\')|(,,)", '"', text)
+    text = re.sub(r"\s+", " ", text).strip()
 
     # hack to remove abstract prefixes
     for prefix in UNWANTED_ABSTRACT_PREFIXES:
         if text.startswith(prefix):
-            text = text[len(prefix):]
+            text = text[len(prefix) :]
             break
 
     assert text, "Empty abstract"
     return text
 
+
 def contrib_name(contrib: ReleaseContrib) -> str:
     # TODO: support more cultural normals for name presentation
     if contrib.raw_name:
@@ -231,36 +243,45 @@ def contrib_name(contrib: ReleaseContrib) -> str:
     else:
         return contrib.given_name
 
+
 def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
     # TODO
     return None
 
+
 def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
 
     d = dict()
     for abst in release.abstracts:
         if not abst.lang in d:
-            d[abst.lang] = ScholarAbstract(lang_code=abst.lang, body=scrub_text(abst.content))
+            d[abst.lang] = ScholarAbstract(
+                lang_code=abst.lang, body=scrub_text(abst.content)
+            )
     return list(d.values())
 
+
 def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
 
     if release.container:
         publisher = release.publisher
         container_name = release.container.name
-        container_original_name = release.container.extra and release.container.extra.get('original_name')
+        container_original_name = (
+            release.container.extra and release.container.extra.get("original_name")
+        )
         container_ident = release.container.ident
         container_type = release.container.container_type
         container_issnl = release.container.issnl
-        issns = [container_issnl,]
-        if release.extra.get('issne'):
-            issns.append(release.extra['issne'])
-        if release.extra.get('issnp'):
-            issns.append(release.extra['issnp'])
+        issns = [
+            container_issnl,
+        ]
+        if release.extra.get("issne"):
+            issns.append(release.extra["issne"])
+        if release.extra.get("issnp"):
+            issns.append(release.extra["issnp"])
         issns = list(set(issns))
     else:
-        publisher = release.extra.get('publisher')
-        container_name = release.extra.get('container_name')
+        publisher = release.extra.get("publisher")
+        container_name = release.extra.get("container_name")
         container_original_name = None
         container_ident = None
         container_type = None
@@ -269,7 +290,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
 
     first_page: Optional[str] = None
     if release.pages:
-        first_page = release.pages.split('-')[0]
+        first_page = release.pages.split("-")[0]
     first_page_int: Optional[int] = None
     if first_page and first_page.isdigit():
         first_page_int = int(first_page)
@@ -285,7 +306,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
         release_stage=release.release_stage,
         withdrawn_status=release.withdrawn_status,
         lang_code=release.language,
-        country_code=release.extra and release.extra.get('country'),
+        country_code=release.extra and release.extra.get("country"),
         volume=release.volume,
         volume_int=None,
         issue=release.issue,
@@ -294,7 +315,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
         first_page=first_page,
         first_page_int=None,
         number=release.number,
-
         doi=release.ext_ids.doi,
         doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi),
         doi_registrar=release_doi_registrar(release),
@@ -305,7 +325,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
         arxiv_id=release.ext_ids.arxiv,
         jstor_id=release.ext_ids.jstor,
         mag_id=release.ext_ids.mag,
-
         license_slug=release.license_slug,
         publisher=publisher,
         container_name=container_name,
@@ -314,14 +333,21 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
         container_type=container_type,
         container_issnl=container_issnl,
         issns=issns,
-
         # TODO; these filters sort of meh. refactor to be above?
-        contrib_names=list(filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs])),
-        contrib_count = len([c for c in release.contribs if c.index]),
-        affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])),
+        contrib_names=list(
+            filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs])
+        ),
+        contrib_count=len([c for c in release.contribs if c.index]),
+        affiliations=list(
+            filter(
+                lambda x: bool(x),
+                [contrib_affiliation(c) for c in release.contribs if c.index],
+            )
+        ),
     )
     return ret
 
+
 def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
 
     if release.container:
@@ -330,7 +356,7 @@ def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
         container_issnl = release.container.issnl
         container_type = release.container.container_type
     else:
-        container_name = release.extra.get('container_name')
+        container_name = release.extra.get("container_name")
         container_ident = None
         container_issnl = None
         container_type = None