From c4cf72914560f92e914a5dbf7360637f6c24f323 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Oct 2020 15:52:54 -0700 Subject: HTML metadata: fix type warnings --- python/sandcrawler/html_metadata.py | 4 +++- python/tests/test_html_metadata.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 71715c2..a9536a6 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -219,7 +219,9 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]: raw_date = meta.pop('raw_date', None) if raw_date: - meta['release_date'] = dateparser.parse(raw_date).date() + parsed = dateparser.parse(raw_date) + if parsed: + meta['release_date'] = parsed.date() raw_release_type = meta.pop('raw_release_type', None) if raw_release_type: diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py index 4154aa5..4d670e5 100644 --- a/python/tests/test_html_metadata.py +++ b/python/tests/test_html_metadata.py @@ -36,7 +36,7 @@ def test_html_metadata_plos() -> None: assert meta.volume == "14" assert meta.container_issn == "1932-6203" assert meta.publisher == "Public Library of Science" - assert "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references + assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references assert meta.release_type == "article-journal" @@ -134,4 +134,5 @@ def test_html_metadata_dc_case() -> None: """ meta = html_extract_biblio(HTMLParser(snippet)) + assert meta is not None assert meta.issue == "123" -- cgit v1.2.3