Diffstat (limited to 'fatcat_scholar')
-rw-r--r--  fatcat_scholar/djvu.py           |  2
-rwxr-xr-x  fatcat_scholar/grobid2json.py    |  4
-rw-r--r--  fatcat_scholar/issue_db.py       |  4
-rw-r--r--  fatcat_scholar/sandcrawler.py    |  1
-rw-r--r--  fatcat_scholar/schema.py         |  2
-rw-r--r--  fatcat_scholar/search.py         | 14
-rw-r--r--  fatcat_scholar/sim_pipeline.py   | 17
-rw-r--r--  fatcat_scholar/transform.py      | 13
-rw-r--r--  fatcat_scholar/web.py            |  6
-rw-r--r--  fatcat_scholar/work_pipeline.py  |  7
10 files changed, 22 insertions, 48 deletions
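The diff below is a mechanical lint cleanup: unused imports are dropped, equality comparisons against None become identity checks, a "not ... in" membership test becomes "not in", and a redundant "== True" comparison is removed from an assert. The linter driving the pass is not named on this page; as a rough illustration of the idioms involved (made-up names and values, not code from fatcat_scholar), see the sketch below.

    # Illustration of the comparison-style fixes applied throughout this commit;
    # the dict, key, and variable names here are invented for the example.
    d = {"en": "abstract body"}
    date = None  # stand-in for an optional lookup result

    # None is a singleton, so identity ("is" / "is not") is the idiomatic test;
    # "== None" relies on __eq__ and is what pycodestyle flags as E711.
    assert date is None

    # "not x in d" parses the same as "x not in d", but the latter reads better
    # and is what the schema.py hunk switches to.
    assert "fr" not in d

    # Comparing a boolean expression to True (E712) is redundant; assert the
    # expression directly, as the work_pipeline.py hunk now does.
    assert bool(d)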
diff --git a/fatcat_scholar/djvu.py b/fatcat_scholar/djvu.py
index ca3e412..c715608 100644
--- a/fatcat_scholar/djvu.py
+++ b/fatcat_scholar/djvu.py
@@ -1,5 +1,5 @@
 from io import StringIO
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import List, Dict, Optional
 import xml.etree.ElementTree as ET
 
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index 57d039e..4019363 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -108,7 +108,7 @@ def biblio_info(elem):
     if ref["publisher"] == "":
         ref["publisher"] = None
     date = elem.find('.//{%s}date[@type="published"]' % ns)
-    ref["date"] = (date != None) and date.attrib.get("when")
+    ref["date"] = (date is not None) and date.attrib.get("when")
     ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
     ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
     el = elem.find(".//{%s}ptr[@target]" % ns)
@@ -148,7 +148,7 @@ def teixml2json(content, encumbered=True):
         )
     info["journal"] = journal_info(header)
     date = header.find('.//{%s}date[@type="published"]' % ns)
-    info["date"] = (date != None) and date.attrib.get("when")
+    info["date"] = (date is not None) and date.attrib.get("when")
     info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
     info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
     if info["doi"]:
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
index 12ffa32..829560b 100644
--- a/fatcat_scholar/issue_db.py
+++ b/fatcat_scholar/issue_db.py
@@ -3,10 +3,10 @@ import json
 import sqlite3
 import argparse
 from dataclasses import dataclass
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import List, Dict, Optional, Any, Sequence
 import fatcat_openapi_client
 import elasticsearch
-from elasticsearch_dsl import Search, Q
+from elasticsearch_dsl import Search
 
 
 @dataclass
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 408682f..347364f 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -1,4 +1,3 @@
-import json
 import minio
 import requests
 from typing import Dict, Optional, Any
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 110991d..29bbe92 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -253,7 +253,7 @@ def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
     d = dict()
     for abst in release.abstracts:
-        if not abst.lang in d:
+        if abst.lang not in d:
             d[abst.lang] = ScholarAbstract(
                 lang_code=abst.lang, body=scrub_text(abst.content)
             )
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 5a61f53..3d9ca9b 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -3,15 +3,13 @@ Helpers to make elasticsearch queries.
 """
 
 import sys
-import json
 from gettext import gettext
 import datetime
 import elasticsearch
 from pydantic import BaseModel
 from dynaconf import settings
-from dataclasses import dataclass
 from elasticsearch_dsl import Search, Q
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import List, Optional, Any
 
 # i18n note: the use of gettext below doesn't actually do the translation here,
 # it just ensures that the strings are caught by babel for translation later
@@ -106,7 +104,7 @@ def do_fulltext_search(
         search = search.filter("terms", type=["report", "standard",])
     elif query.filter_type == "datasets":
         search = search.filter("terms", type=["dataset", "software",])
-    elif query.filter_type == "everything" or query.filter_type == None:
+    elif query.filter_type == "everything" or query.filter_type is None:
         pass
     else:
         raise ValueError(
@@ -129,7 +127,7 @@ def do_fulltext_search(
         search = search.filter("range", year=dict(gte=2000))
     elif query.filter_time == "before_1925":
         search = search.filter("range", year=dict(lt=1925))
-    elif query.filter_time == "all_time" or query.filter_time == None:
+    elif query.filter_time == "all_time" or query.filter_time is None:
         pass
     else:
         raise ValueError(
@@ -141,7 +139,7 @@ def do_fulltext_search(
         search = search.filter("term", tag="oa")
     elif query.filter_availability == "everything":
         pass
-    elif query.filter_availability == "fulltext" or query.filter_availability == None:
+    elif query.filter_availability == "fulltext" or query.filter_availability is None:
         search = search.filter("terms", access_type=["wayback", "ia_file", "ia_sim"])
     else:
         raise ValueError(
@@ -199,7 +197,7 @@ def do_fulltext_search(
         search = search.sort("year", "date")
     elif query.sort_order == "time_desc":
         search = search.sort("-year", "-date")
-    elif query.sort_order == "relevancy" or query.sort_order == None:
+    elif query.sort_order == "relevancy" or query.sort_order is None:
         pass
    else:
         raise ValueError(f"Unknown 'sort_order' parameter value: '{query.sort_order}'")
@@ -211,7 +209,7 @@ def do_fulltext_search(
         # Avoid deep paging problem.
         offset = deep_page_limit
 
-    search = search[offset : offset + limit]
+    search = search[offset:(offset+limit)]
 
     try:
         resp = search.execute()
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index b84ac47..cfc197f 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -1,24 +1,15 @@
-import os
 import io
 import sys
 import sqlite3
 import argparse
+from typing import List, Dict, Optional, Any
+
 import requests
-from pydantic import BaseModel, validator
-from typing import List, Dict, Tuple, Optional, Any, Sequence
-from fatcat_openapi_client import ReleaseEntity, FileEntity
 import internetarchive
 
-from fatcat_scholar.api_entities import *
 from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import (
-    SandcrawlerPostgrestClient,
-    SandcrawlerMinioClient,
-)
-from fatcat_scholar.issue_db import IssueDB, SimIssueRow
+from fatcat_scholar.issue_db import IssueDB
 from fatcat_scholar.schema import (
-    es_biblio_from_release,
-    es_release_from_release,
     DocType,
     IntermediateBundle,
 )
@@ -57,7 +48,7 @@ class SimPipeline:
 
     def fetch_sim_issue(self, issue_db_row: Any) -> Optional[Any]:
         """
-        issue_item 
+        issue_item
         pages: str
         page_texts: list
             raw_text
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index b5a0223..28c959b 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,19 +1,10 @@
-import os
-import io
 import sys
 import argparse
-from pydantic import BaseModel, validator
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import List, Dict, Optional, Any
+
 from fatcat_openapi_client import ReleaseEntity, FileEntity
-import internetarchive
 
 from fatcat_scholar.api_entities import *
-from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import (
-    SandcrawlerPostgrestClient,
-    SandcrawlerMinioClient,
-)
-from fatcat_scholar.issue_db import IssueDB, SimIssueRow
 from fatcat_scholar.schema import *
 from fatcat_scholar.grobid2json import teixml2json
 
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 6c8a2e9..3f6982d 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -5,13 +5,11 @@ So far there are few endpoints, so we just put them all here!
 """
 
 import sys
-from enum import Enum
 import babel.support
-from fastapi import FastAPI, APIRouter, Request, Depends, Header
+from fastapi import FastAPI, APIRouter, Request, Depends
 from fastapi.staticfiles import StaticFiles
-from fastapi.responses import HTMLResponse
 from dynaconf import settings
-from typing import List, Dict, Tuple, Optional, Any, Sequence
+from typing import Optional
 
 from fatcat_scholar.hacks import Jinja2Templates
 from fatcat_scholar.search import do_fulltext_search, FulltextQuery, FulltextHits
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index af558a3..09ae02f 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -4,7 +4,6 @@ import sys
 import minio
 import requests
 import argparse
-from pydantic import BaseModel, validator
 from typing import List, Dict, Tuple, Optional, Any, Sequence
 from fatcat_openapi_client import ReleaseEntity, FileEntity
 import internetarchive
@@ -17,8 +16,6 @@ from fatcat_scholar.sandcrawler import (
 )
 from fatcat_scholar.issue_db import IssueDB, SimIssueRow, SimPubRow
 from fatcat_scholar.schema import (
-    es_biblio_from_release,
-    es_release_from_release,
     DocType,
     IntermediateBundle,
 )
@@ -171,7 +168,7 @@ class WorkPipeline:
         release_ident: str,
     ) -> Optional[Any]:
         """
-        issue_item 
+        issue_item
         pages: str
         page_texts: list
             page_num
@@ -214,7 +211,7 @@ class WorkPipeline:
         # override 'close()' method so we can still read out contents
         djvu_bytes = io.BytesIO()
         djvu_bytes.close = lambda: None  # type: ignore
-        assert issue_item_djvu.download(fileobj=djvu_bytes) == True
+        assert issue_item_djvu.download(fileobj=djvu_bytes)
         djvu_bytes.seek(0)
         djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8"))
         del djvu_bytes