aboutsummaryrefslogtreecommitdiffstats
path: root/chocula
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-02 17:00:36 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-02 17:00:36 -0700
commit 3c5775eea38fd062a9d111db6c5fc2f6fa002b2e (patch)
tree 7fcf838f0773b9e1911bef90b79c54284afad957 /chocula
parent 0b9f64d2b4409eefbd8b20a9a5eb2e1e2b7650fd (diff)
download chocula-3c5775eea38fd062a9d111db6c5fc2f6fa002b2e.tar.gz
chocula-3c5775eea38fd062a9d111db6c5fc2f6fa002b2e.zip
fixes for KBART import
Diffstat (limited to 'chocula')
-rw-r--r-- chocula/common.py 20
-rw-r--r-- chocula/util.py 4
2 files changed, 16 insertions, 8 deletions
diff --git a/chocula/common.py b/chocula/common.py
index 3c8761f..a5b3739 100644
--- a/chocula/common.py
+++ b/chocula/common.py
@@ -13,6 +13,8 @@ from chocula.config import ChoculaConfig
from chocula.database import DirectoryInfo, IssnDatabase, HomepageUrl
+# Portico files have weirdly large field sizes
+csv.field_size_limit(1310720)
THIS_YEAR = datetime.date.today().year
class DirectoryLoader():
@@ -77,8 +79,8 @@ class KbartLoader():
def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]:
- issne: Optional[str] = clean_issn(row['online_identifier'])
- issnp: Optional[str] = clean_issn(row['print_identifier'])
+ issne: Optional[str] = clean_issn(row['online_identifier'] or "")
+ issnp: Optional[str] = clean_issn(row['print_identifier'] or "")
issnl: Optional[str] = None
if issne:
issnl = issn_db.issn2issnl(issne)
@@ -92,7 +94,7 @@ class KbartLoader():
end_year = int(row['date_last_issue_online'][:4])
end_volume = row['num_last_vol_online']
# hack to handle open-ended preservation
- if end_year is None and '(present)' in end_volume:
+ if end_year is None and end_volume and '(present)' in end_volume:
end_year = THIS_YEAR
record = KbartRecord(
issnl=issnl,
@@ -101,13 +103,17 @@ class KbartLoader():
title=clean_str(row['publication_title']),
publisher=clean_str(row['publisher_name']),
url=HomepageUrl.from_url(row['title_url']),
- embargo=row['embargo_info'] or None,
+ embargo=clean_str(row['embargo_info']),
start_year=start_year,
end_year=end_year,
- start_volume=row['num_first_vol_online'],
- end_volume=row['num_last_vol_online'],
+ start_volume=clean_str(row['num_first_vol_online']),
+ end_volume=clean_str(row['num_last_vol_online']),
year_spans=[],
)
+ if record.start_volume == 'null':
+ record.start_volume = None
+ if record.end_volume == 'null':
+ record.end_volume = None
return record
def index_file(self, db) -> Counter:
@@ -130,7 +136,7 @@ class KbartLoader():
counts['skip-issnl'] += 1
continue
elif record.start_year is None or record.end_year is None:
- counts['missing-years'] += 1
+ counts['partial-missing-years'] += 1
counts['parsed'] += 1
existing = kbart_dict.get(record.issnl, record)
diff --git a/chocula/util.py b/chocula/util.py
index 09860d8..4080e5b 100644
--- a/chocula/util.py
+++ b/chocula/util.py
@@ -1,4 +1,5 @@
+import sys
from dataclasses import dataclass
from typing import Dict, Optional
@@ -171,7 +172,8 @@ def gaps_to_spans(first, last, gaps):
print("mangled years: {}".format((first, last, gaps)), file=sys.stderr)
full = list(range(first, last+1))
for missing in gaps:
- full.remove(missing)
+ if missing in full:
+ full.remove(missing)
spans = []
low = None
last = None