summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/djvu.py 2
-rwxr-xr-x fatcat_scholar/grobid2json.py 4
-rw-r--r-- fatcat_scholar/issue_db.py 4
-rw-r--r-- fatcat_scholar/sandcrawler.py 1
-rw-r--r-- fatcat_scholar/schema.py 2
-rw-r--r-- fatcat_scholar/search.py 14
-rw-r--r-- fatcat_scholar/sim_pipeline.py 17
-rw-r--r-- fatcat_scholar/transform.py 13
-rw-r--r-- fatcat_scholar/web.py 6
-rw-r--r-- fatcat_scholar/work_pipeline.py 7
10 files changed, 22 insertions, 48 deletions
diff --git a/fatcat_scholar/djvu.py b/fatcat_scholar/djvu.py
index ca3e412..c715608 100644
--- a/fatcat_scholar/djvu.py
+++ b/fatcat_scholar/djvu.py
@@ -1,5 +1,5 @@
from io import StringIO
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import List, Dict, Optional
import xml.etree.ElementTree as ET
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index 57d039e..4019363 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -108,7 +108,7 @@ def biblio_info(elem):
if ref["publisher"] == "":
ref["publisher"] = None
date = elem.find('.//{%s}date[@type="published"]' % ns)
- ref["date"] = (date != None) and date.attrib.get("when")
+ ref["date"] = (date is not None) and date.attrib.get("when")
ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
el = elem.find(".//{%s}ptr[@target]" % ns)
@@ -148,7 +148,7 @@ def teixml2json(content, encumbered=True):
)
info["journal"] = journal_info(header)
date = header.find('.//{%s}date[@type="published"]' % ns)
- info["date"] = (date != None) and date.attrib.get("when")
+ info["date"] = (date is not None) and date.attrib.get("when")
info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
if info["doi"]:
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
index 12ffa32..829560b 100644
--- a/fatcat_scholar/issue_db.py
+++ b/fatcat_scholar/issue_db.py
@@ -3,10 +3,10 @@ import json
import sqlite3
import argparse
from dataclasses import dataclass
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import List, Dict, Optional, Any, Sequence
import fatcat_openapi_client
import elasticsearch
-from elasticsearch_dsl import Search, Q
+from elasticsearch_dsl import Search
@dataclass
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 408682f..347364f 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -1,4 +1,3 @@
-import json
import minio
import requests
from typing import Dict, Optional, Any
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 110991d..29bbe92 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -253,7 +253,7 @@ def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
d = dict()
for abst in release.abstracts:
- if not abst.lang in d:
+ if abst.lang not in d:
d[abst.lang] = ScholarAbstract(
lang_code=abst.lang, body=scrub_text(abst.content)
)
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 5a61f53..3d9ca9b 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -3,15 +3,13 @@ Helpers to make elasticsearch queries.
"""
import sys
-import json
from gettext import gettext
import datetime
import elasticsearch
from pydantic import BaseModel
from dynaconf import settings
-from dataclasses import dataclass
from elasticsearch_dsl import Search, Q
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import List, Optional, Any
# i18n note: the use of gettext below doesn't actually do the translation here,
# it just ensures that the strings are caught by babel for translation later
@@ -106,7 +104,7 @@ def do_fulltext_search(
search = search.filter("terms", type=["report", "standard",])
elif query.filter_type == "datasets":
search = search.filter("terms", type=["dataset", "software",])
- elif query.filter_type == "everything" or query.filter_type == None:
+ elif query.filter_type == "everything" or query.filter_type is None:
pass
else:
raise ValueError(
@@ -129,7 +127,7 @@ def do_fulltext_search(
search = search.filter("range", year=dict(gte=2000))
elif query.filter_time == "before_1925":
search = search.filter("range", year=dict(lt=1925))
- elif query.filter_time == "all_time" or query.filter_time == None:
+ elif query.filter_time == "all_time" or query.filter_time is None:
pass
else:
raise ValueError(
@@ -141,7 +139,7 @@ def do_fulltext_search(
search = search.filter("term", tag="oa")
elif query.filter_availability == "everything":
pass
- elif query.filter_availability == "fulltext" or query.filter_availability == None:
+ elif query.filter_availability == "fulltext" or query.filter_availability is None:
search = search.filter("terms", access_type=["wayback", "ia_file", "ia_sim"])
else:
raise ValueError(
@@ -199,7 +197,7 @@ def do_fulltext_search(
search = search.sort("year", "date")
elif query.sort_order == "time_desc":
search = search.sort("-year", "-date")
- elif query.sort_order == "relevancy" or query.sort_order == None:
+ elif query.sort_order == "relevancy" or query.sort_order is None:
pass
else:
raise ValueError(f"Unknown 'sort_order' parameter value: '{query.sort_order}'")
@@ -211,7 +209,7 @@ def do_fulltext_search(
# Avoid deep paging problem.
offset = deep_page_limit
- search = search[offset : offset + limit]
+ search = search[offset:(offset+limit)]
try:
resp = search.execute()
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index b84ac47..cfc197f 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -1,24 +1,15 @@
-import os
import io
import sys
import sqlite3
import argparse
+from typing import List, Dict, Optional, Any
+
import requests
-from pydantic import BaseModel, validator
-from typing import List, Dict, Tuple, Optional, Any, Sequence
-from fatcat_openapi_client import ReleaseEntity, FileEntity
import internetarchive
-from fatcat_scholar.api_entities import *
from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import (
- SandcrawlerPostgrestClient,
- SandcrawlerMinioClient,
-)
-from fatcat_scholar.issue_db import IssueDB, SimIssueRow
+from fatcat_scholar.issue_db import IssueDB
from fatcat_scholar.schema import (
- es_biblio_from_release,
- es_release_from_release,
DocType,
IntermediateBundle,
)
@@ -57,7 +48,7 @@ class SimPipeline:
def fetch_sim_issue(self, issue_db_row: Any) -> Optional[Any]:
"""
- issue_item
+ issue_item
pages: str
page_texts: list
raw_text
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index b5a0223..28c959b 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,19 +1,10 @@
-import os
-import io
import sys
import argparse
-from pydantic import BaseModel, validator
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import List, Dict, Optional, Any
+
from fatcat_openapi_client import ReleaseEntity, FileEntity
-import internetarchive
from fatcat_scholar.api_entities import *
-from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import (
- SandcrawlerPostgrestClient,
- SandcrawlerMinioClient,
-)
-from fatcat_scholar.issue_db import IssueDB, SimIssueRow
from fatcat_scholar.schema import *
from fatcat_scholar.grobid2json import teixml2json
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 6c8a2e9..3f6982d 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -5,13 +5,11 @@ So far there are few endpoints, so we just put them all here!
"""
import sys
-from enum import Enum
import babel.support
-from fastapi import FastAPI, APIRouter, Request, Depends, Header
+from fastapi import FastAPI, APIRouter, Request, Depends
from fastapi.staticfiles import StaticFiles
-from fastapi.responses import HTMLResponse
from dynaconf import settings
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import Optional
from fatcat_scholar.hacks import Jinja2Templates
from fatcat_scholar.search import do_fulltext_search, FulltextQuery, FulltextHits
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index af558a3..09ae02f 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -4,7 +4,6 @@ import sys
import minio
import requests
import argparse
-from pydantic import BaseModel, validator
from typing import List, Dict, Tuple, Optional, Any, Sequence
from fatcat_openapi_client import ReleaseEntity, FileEntity
import internetarchive
@@ -17,8 +16,6 @@ from fatcat_scholar.sandcrawler import (
)
from fatcat_scholar.issue_db import IssueDB, SimIssueRow, SimPubRow
from fatcat_scholar.schema import (
- es_biblio_from_release,
- es_release_from_release,
DocType,
IntermediateBundle,
)
@@ -171,7 +168,7 @@ class WorkPipeline:
release_ident: str,
) -> Optional[Any]:
"""
- issue_item
+ issue_item
pages: str
page_texts: list
page_num
@@ -214,7 +211,7 @@ class WorkPipeline:
# override 'close()' method so we can still read out contents
djvu_bytes = io.BytesIO()
djvu_bytes.close = lambda: None # type: ignore
- assert issue_item_djvu.download(fileobj=djvu_bytes) == True
+ assert issue_item_djvu.download(fileobj=djvu_bytes)
djvu_bytes.seek(0)
djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8"))
del djvu_bytes