about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/html.py | 13
-rw-r--r-- python/sandcrawler/persist.py | 6
2 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 5fba963..50183be 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -2,7 +2,7 @@ import json
import re
import sys
import urllib.parse
-from typing import Dict
+from typing import Any, Dict
from bs4 import BeautifulSoup
@@ -32,6 +32,11 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
print(f"{ule} (url={html_url})", file=sys.stderr)
return dict()
+ # ignoring most type checks on bs4 output in this function (which is partially deprecated)
+ meta: Any
+ url: Any
+ redirect: Any
+
### General Tricks ###
# highwire-style meta tag
@@ -99,7 +104,9 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
# sciencedirect PDF URL extract
# https://www.sciencedirect.com/science/article/pii/S0169204621000670
if "sciencedirect.com/science/article/pii/" in html_url and not html_url.endswith(".pdf"):
- json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"})
+ json_tag: Any = soup.find(
+ "script", attrs={"type": "application/json", "data-iso-key": "_0"}
+ )
url = None
if json_tag:
try:
@@ -146,7 +153,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
if "://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber" in html_url:
# HTML iframe like:
# <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&amp;arnumber=8730313&amp;isnumber=8600701&amp;ref=" frameborder="0"></iframe>
- iframe = soup.find("iframe")
+ iframe: Any = soup.find("iframe")
if iframe and ".pdf" in iframe["src"]:
return dict(pdf_url=iframe["src"], technique="iframe")
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index f7954b1..c8c0c33 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -22,6 +22,8 @@ import os
import xml.etree.ElementTree
from typing import Any, Dict, List, Optional
+import psycopg2
+
from sandcrawler.db import SandcrawlerPostgresClient
from sandcrawler.grobid import GrobidClient
from sandcrawler.ingest_html import HtmlMetaRow
@@ -358,7 +360,7 @@ class PersistGrobidWorker(SandcrawlerWorker):
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
if not self.s3_only:
self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
else:
self.db = None
self.cur = None
@@ -514,7 +516,7 @@ class PersistPdfTextWorker(SandcrawlerWorker):
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
if not self.s3_only:
self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
else:
self.db = None
self.cur = None