about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-27 15:52:54 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-27 15:52:54 -0700
commit c4cf72914560f92e914a5dbf7360637f6c24f323 (patch)
tree b3127216f69c5bc52d994a6b6ddf084e7971ad1c
parent 58f89d645063415bb9e1d36102cbf4dfc45cffda (diff)
download sandcrawler-c4cf72914560f92e914a5dbf7360637f6c24f323.tar.gz
sandcrawler-c4cf72914560f92e914a5dbf7360637f6c24f323.zip
HTML metadata: fix type warnings
-rw-r--r-- python/sandcrawler/html_metadata.py 4
-rw-r--r-- python/tests/test_html_metadata.py 3
2 files changed, 5 insertions, 2 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 71715c2..a9536a6 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -219,7 +219,9 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
raw_date = meta.pop('raw_date', None)
if raw_date:
- meta['release_date'] = dateparser.parse(raw_date).date()
+ parsed = dateparser.parse(raw_date)
+ if parsed:
+ meta['release_date'] = parsed.date()
raw_release_type = meta.pop('raw_release_type', None)
if raw_release_type:
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index 4154aa5..4d670e5 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -36,7 +36,7 @@ def test_html_metadata_plos() -> None:
assert meta.volume == "14"
assert meta.container_issn == "1932-6203"
assert meta.publisher == "Public Library of Science"
- assert "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
+ assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
assert meta.release_type == "article-journal"
@@ -134,4 +134,5 @@ def test_html_metadata_dc_case() -> None:
</html>"""
meta = html_extract_biblio(HTMLParser(snippet))
+ assert meta is not None
assert meta.issue == "123"