diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html.py | 13
-rw-r--r-- | python/sandcrawler/persist.py | 6
2 files changed, 14 insertions, 5 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 5fba963..50183be 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -2,7 +2,7 @@ import json import re import sys import urllib.parse -from typing import Dict +from typing import Any, Dict from bs4 import BeautifulSoup @@ -32,6 +32,11 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: print(f"{ule} (url={html_url})", file=sys.stderr) return dict() + # ignoring most type checks on bs4 output in this function (which is partially deprecated) + meta: Any + url: Any + redirect: Any + ### General Tricks ### # highwire-style meta tag @@ -99,7 +104,9 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: # sciencedirect PDF URL extract # https://www.sciencedirect.com/science/article/pii/S0169204621000670 if "sciencedirect.com/science/article/pii/" in html_url and not html_url.endswith(".pdf"): - json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"}) + json_tag: Any = soup.find( + "script", attrs={"type": "application/json", "data-iso-key": "_0"} + ) url = None if json_tag: try: @@ -146,7 +153,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: if "://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber" in html_url: # HTML iframe like: # <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&arnumber=8730313&isnumber=8600701&ref=" frameborder="0"></iframe> - iframe = soup.find("iframe") + iframe: Any = soup.find("iframe") if iframe and ".pdf" in iframe["src"]: return dict(pdf_url=iframe["src"], technique="iframe") diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index f7954b1..c8c0c33 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -22,6 +22,8 @@ import os import xml.etree.ElementTree from typing import Any, Dict, List, Optional +import psycopg2 + from sandcrawler.db import SandcrawlerPostgresClient from sandcrawler.grobid import GrobidClient from sandcrawler.ingest_html import HtmlMetaRow @@ -358,7 +360,7 @@ class PersistGrobidWorker(SandcrawlerWorker): assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed" if not self.s3_only: self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url) - self.cur = self.db.conn.cursor() + self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor() else: self.db = None self.cur = None @@ -514,7 +516,7 @@ class PersistPdfTextWorker(SandcrawlerWorker): assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed" if not self.s3_only: self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url) - self.cur = self.db.conn.cursor() + self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor() else: self.db = None self.cur = None