From d5aa7960d40261fa0a10be4ad6fbdbfc9cc4b3af Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 30 Oct 2020 15:18:07 -0700
Subject: cdx datetime parsing improvements

---
 python/sandcrawler/misc.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index b078d6c..8c91246 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -155,6 +155,8 @@ def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
     )
 
 def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
+    if not dt_str:
+        return None
     try:
         return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
     except Exception:
@@ -164,7 +166,16 @@ def test_parse_cdx_datetime() -> None:
     assert parse_cdx_datetime("") == None
     assert parse_cdx_datetime("asdf") == None
     assert parse_cdx_datetime("19930203123045") != None
+    assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+
+def datetime_to_cdx(dt: datetime.datetime) -> str:
+    return '%04d%02d%02d%02d%02d%02d' % (
+        dt.year, dt.month, dt.day,
+        dt.hour, dt.minute, dt.second,
+    )
 
+def test_datetime_to_cdx() -> None:
+    assert "20201028235103" == datetime_to_cdx(datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
 
 def requests_retry_session(retries=10, backoff_factor=3,
         status_forcelist=(500, 502, 504), session=None) -> requests.Session:
-- 
cgit v1.2.3