aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
commit: 31d1a6a713d177990609767d508209ced19ca396 (patch)
tree: a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers
parent: 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download: fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz
fatcat-31d1a6a713d177990609767d508209ced19ca396.zip
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/arabesque.py 113
-rw-r--r-- python/fatcat_tools/importers/arxiv.py 210
-rwxr-xr-x python/fatcat_tools/importers/cdl_dash_dat.py 119
-rw-r--r-- python/fatcat_tools/importers/chocula.py 149
-rw-r--r-- python/fatcat_tools/importers/common.py 198
-rw-r--r-- python/fatcat_tools/importers/crossref.py 413
-rw-r--r-- python/fatcat_tools/importers/datacite.py 824
-rw-r--r-- python/fatcat_tools/importers/dblp_container.py 81
-rw-r--r-- python/fatcat_tools/importers/dblp_release.py 257
-rw-r--r-- python/fatcat_tools/importers/doaj_article.py 178
-rw-r--r-- python/fatcat_tools/importers/file_meta.py 36
-rw-r--r-- python/fatcat_tools/importers/fileset_generic.py 55
-rw-r--r-- python/fatcat_tools/importers/grobid_metadata.py 136
-rw-r--r-- python/fatcat_tools/importers/ingest.py 693
-rw-r--r-- python/fatcat_tools/importers/jalc.py 193
-rw-r--r-- python/fatcat_tools/importers/journal_metadata.py 111
-rw-r--r-- python/fatcat_tools/importers/jstor.py 140
-rw-r--r-- python/fatcat_tools/importers/matched.py 103
-rw-r--r-- python/fatcat_tools/importers/orcid.py 50
-rw-r--r-- python/fatcat_tools/importers/pubmed.py 355
-rw-r--r-- python/fatcat_tools/importers/shadow.py 113
-rwxr-xr-x python/fatcat_tools/importers/wayback_static.py 166
22 files changed, 2578 insertions, 2115 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index 2b0ff7ec..ae4f9049 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -1,9 +1,9 @@
-
import fatcat_openapi_client
from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
-ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL'
+ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
+
class ArabesqueMatchImporter(EntityImporter):
"""
@@ -38,17 +38,17 @@ class ArabesqueMatchImporter(EntityImporter):
def __init__(self, api, extid_type, require_grobid=True, **kwargs):
- eg_desc = kwargs.get('editgroup_description', None) or "Match web crawl files to releases based on identifier/URL seedlist"
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter')
- if kwargs.get('crawl_id'):
- eg_extra['crawl_id'] = kwargs.get('crawl_id')
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
- assert extid_type in ('doi', 'pmcid', 'pmid')
+ eg_desc = (
+ kwargs.get("editgroup_description", None)
+ or "Match web crawl files to releases based on identifier/URL seedlist"
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArabesqueMatchImporter")
+ if kwargs.get("crawl_id"):
+ eg_extra["crawl_id"] = kwargs.get("crawl_id")
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
+ assert extid_type in ("doi", "pmcid", "pmid")
self.extid_type = extid_type
self.default_link_rel = kwargs.get("default_link_rel", "web")
assert self.default_link_rel
@@ -60,33 +60,35 @@ class ArabesqueMatchImporter(EntityImporter):
print("NOT checking GROBID status column")
def want(self, row):
- if self.require_grobid and not row['postproc_status'] == "200":
+ if self.require_grobid and not row["postproc_status"] == "200":
return False
- if (bool(row['hit']) is True
- and row['final_sha1']
- and row['final_timestamp']
- and row['final_timestamp'] != "-"
- and len(row['final_timestamp']) == 14
- and row['final_mimetype']
- and bool(row['hit']) is True
- and row['identifier']):
+ if (
+ bool(row["hit"]) is True
+ and row["final_sha1"]
+ and row["final_timestamp"]
+ and row["final_timestamp"] != "-"
+ and len(row["final_timestamp"]) == 14
+ and row["final_mimetype"]
+ and bool(row["hit"]) is True
+ and row["identifier"]
+ ):
return True
else:
return False
def parse_record(self, row):
- extid = row['identifier'].strip()
+ extid = row["identifier"].strip()
# check/cleanup DOI
- if self.extid_type == 'doi':
+ if self.extid_type == "doi":
extid = extid.lower()
- extid.replace('http://doi.org/', '')
- extid.replace('https://doi.org/', '')
- if extid.startswith('doi:'):
+ extid.replace("http://doi.org/", "")
+ extid.replace("https://doi.org/", "")
+ if extid.startswith("doi:"):
extid = extid[4:]
- if not extid.startswith('10.'):
- self.counts['skip-extid-invalid']
+ if not extid.startswith("10."):
+ self.counts["skip-extid-invalid"]
return None
# lookup extid
@@ -95,35 +97,35 @@ class ArabesqueMatchImporter(EntityImporter):
except fatcat_openapi_client.rest.ApiException as err:
if err.status == 404:
# bail on 404 (release not in DB)
- self.counts['skip-extid-not-found'] += 1
+ self.counts["skip-extid-not-found"] += 1
return None
elif err.status == 400:
- self.counts['skip-extid-invalid'] += 1
+ self.counts["skip-extid-invalid"] += 1
return None
else:
raise err
- url = make_rel_url(row['final_url'], self.default_link_rel)
+ url = make_rel_url(row["final_url"], self.default_link_rel)
if not url:
- self.counts['skip-url'] += 1
+ self.counts["skip-url"] += 1
return None
- if not row['final_timestamp']:
- self.counts['skip-missing-timestamp'] += 1
+ if not row["final_timestamp"]:
+ self.counts["skip-missing-timestamp"] += 1
return None
wayback = "https://web.archive.org/web/{}/{}".format(
- row['final_timestamp'],
- row['final_url'])
+ row["final_timestamp"], row["final_url"]
+ )
urls = [url, ("webarchive", wayback)]
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
if len(urls) > SANE_MAX_URLS:
- self.counts['skip-too-many-url'] += 1
+ self.counts["skip-too-many-url"] += 1
return None
fe = fatcat_openapi_client.FileEntity(
- sha1=b32_hex(row['final_sha1']),
- mimetype=row['final_mimetype'] or self.default_mimetype,
+ sha1=b32_hex(row["final_sha1"]),
+ mimetype=row["final_mimetype"] or self.default_mimetype,
release_ids=[re.ident],
urls=urls,
)
@@ -143,15 +145,15 @@ class ArabesqueMatchImporter(EntityImporter):
if (fe.release_ids[0] in existing.release_ids) and existing.urls:
# TODO: could still, in theory update with the new URL?
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if not self.do_updates:
- self.counts['skip-update-disabled'] += 1
+ self.counts["skip-update-disabled"] += 1
return False
if existing.ident in [e.ident for e in self._edits_inflight]:
- self.counts['skip-update-inflight'] += 1
+ self.counts["skip-update-inflight"] += 1
return False
# TODO: this code path never gets hit because of the check above
@@ -159,28 +161,33 @@ class ArabesqueMatchImporter(EntityImporter):
existing_urls = set([u.url for u in existing.urls])
new_urls = set([u.url for u in fe.urls])
if existing_urls.issuperset(new_urls):
- self.counts['skip-update-nothing-new'] += 1
+ self.counts["skip-update-nothing-new"] += 1
return False
# merge the existing into this one and update
existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
- existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls]
+ existing.urls = [
+ fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls
+ ]
if len(existing.urls) > SANE_MAX_URLS:
- self.counts['skip-update-too-many-url'] += 1
+ self.counts["skip-update-too-many-url"] += 1
return None
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
if len(existing.release_ids) > SANE_MAX_RELEASES:
- self.counts['skip-update-too-many-url'] += 1
+ self.counts["skip-update-too-many-url"] += 1
return None
existing.mimetype = existing.mimetype or fe.mimetype
edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
self._edits_inflight.append(edit)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
def insert_batch(self, batch):
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index fc429fb0..7a689ed2 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -1,4 +1,3 @@
-
import datetime
import json
import re
@@ -13,6 +12,7 @@ from .crossref import lookup_license_slug
latex2text = LatexNodes2Text()
+
def latex_to_text(raw):
try:
return latex2text.latex_to_text(raw).strip()
@@ -21,13 +21,14 @@ def latex_to_text(raw):
except IndexError:
return raw.strip()
+
def parse_arxiv_authors(raw):
if not raw:
return []
- raw = raw.replace('*', '')
- if '(' in raw:
- raw = re.sub(r'\(.*\)', '', raw)
- authors = raw.split(', ')
+ raw = raw.replace("*", "")
+ if "(" in raw:
+ raw = re.sub(r"\(.*\)", "", raw)
+ authors = raw.split(", ")
if authors:
last = authors[-1].split(" and ")
if len(last) == 2:
@@ -39,9 +40,12 @@ def parse_arxiv_authors(raw):
authors = [a for a in authors if a]
return authors
+
def test_parse_arxiv_authors():
- assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+ assert parse_arxiv_authors(
+ "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an"
+ ) == [
"Raphael Chetrite",
"Shamik Gupta",
"Izaak Neri",
@@ -63,7 +67,9 @@ def test_parse_arxiv_authors():
"Raphael Chetrite Shamik Gupta",
]
- assert parse_arxiv_authors("B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)") == [
+ assert parse_arxiv_authors(
+ "B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)"
+ ) == [
"B. P. Lanyon",
"T. J. Weinhold",
"N. K. Langford",
@@ -84,17 +90,21 @@ class ArxivRawImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of arxiv metadata via arXivRaw OAI-PMH feed",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArxivRawImporter")
# lower batch size, because multiple versions per entry (guessing 2-3 on average?)
- batch_size = kwargs.get('edit_batch_size', 50)
- super().__init__(api,
+ batch_size = kwargs.get("edit_batch_size", 50)
+ super().__init__(
+ api,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
batch_size=batch_size,
- **kwargs)
+ **kwargs
+ )
self._test_override = False
def parse_record(self, record):
@@ -114,53 +124,56 @@ class ArxivRawImporter(EntityImporter):
doi = None
if metadata.doi and metadata.doi.string:
doi = metadata.doi.string.lower().split()[0].strip()
- if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]):
+ if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
sys.stderr.write("BOGUS DOI: {}\n".format(doi))
doi = None
- title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
- authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' '))
- contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)]
-
- lang = "en" # the vast majority in english
+ title = latex_to_text(metadata.title.get_text().replace("\n", " "))
+ authors = parse_arxiv_authors(metadata.authors.get_text().replace("\n", " "))
+ contribs = [
+ fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role="author")
+ for i, a in enumerate(authors)
+ ]
+
+ lang = "en" # the vast majority in english
if metadata.comments and metadata.comments.get_text():
- comments = metadata.comments.get_text().replace('\n', ' ').strip()
- extra_arxiv['comments'] = comments
- if 'in french' in comments.lower():
- lang = 'fr'
- elif 'in spanish' in comments.lower():
- lang = 'es'
- elif 'in portuguese' in comments.lower():
- lang = 'pt'
- elif 'in hindi' in comments.lower():
- lang = 'hi'
- elif 'in japanese' in comments.lower():
- lang = 'ja'
- elif 'in german' in comments.lower():
- lang = 'de'
- elif 'simplified chinese' in comments.lower():
- lang = 'zh'
- elif 'in russian' in comments.lower():
- lang = 'ru'
+ comments = metadata.comments.get_text().replace("\n", " ").strip()
+ extra_arxiv["comments"] = comments
+ if "in french" in comments.lower():
+ lang = "fr"
+ elif "in spanish" in comments.lower():
+ lang = "es"
+ elif "in portuguese" in comments.lower():
+ lang = "pt"
+ elif "in hindi" in comments.lower():
+ lang = "hi"
+ elif "in japanese" in comments.lower():
+ lang = "ja"
+ elif "in german" in comments.lower():
+ lang = "de"
+ elif "simplified chinese" in comments.lower():
+ lang = "zh"
+ elif "in russian" in comments.lower():
+ lang = "ru"
# more languages?
number = None
- if metadata.find('journal-ref') and metadata.find('journal-ref').get_text():
- journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip()
- extra_arxiv['journal_ref'] = journal_ref
+ if metadata.find("journal-ref") and metadata.find("journal-ref").get_text():
+ journal_ref = metadata.find("journal-ref").get_text().replace("\n", " ").strip()
+ extra_arxiv["journal_ref"] = journal_ref
if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
release_type = "paper-conference"
- if metadata.find('report-no') and metadata.find('report-no').string:
- number = metadata.find('report-no').string.strip()
+ if metadata.find("report-no") and metadata.find("report-no").string:
+ number = metadata.find("report-no").string.strip()
# at least some people plop extra metadata in here. hrmf!
- if 'ISSN ' in number or 'ISBN ' in number or len(number.split()) > 2:
- extra_arxiv['report-no'] = number
+ if "ISSN " in number or "ISBN " in number or len(number.split()) > 2:
+ extra_arxiv["report-no"] = number
number = None
else:
release_type = "report"
- if metadata.find('acm-class') and metadata.find('acm-class').string:
- extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
+ if metadata.find("acm-class") and metadata.find("acm-class").string:
+ extra_arxiv["acm_class"] = metadata.find("acm-class").string.strip()
if metadata.categories and metadata.categories.get_text():
- extra_arxiv['categories'] = metadata.categories.get_text().split()
+ extra_arxiv["categories"] = metadata.categories.get_text().split()
license_slug = None
if metadata.license and metadata.license.get_text():
license_slug = lookup_license_slug(metadata.license.get_text())
@@ -170,21 +183,29 @@ class ArxivRawImporter(EntityImporter):
abstracts = []
abst = metadata.abstract.get_text().strip()
orig = None
- if '-----' in abst:
- both = abst.split('-----')
+ if "-----" in abst:
+ both = abst.split("-----")
abst = both[0].strip()
orig = both[1].strip()
- if '$' in abst or '{' in abst:
+ if "$" in abst or "{" in abst:
mime = "application/x-latex"
abst_plain = latex_to_text(abst)
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en"))
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(
+ content=abst_plain, mimetype="text/plain", lang="en"
+ )
+ )
else:
mime = "text/plain"
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en"))
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en")
+ )
if orig:
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime))
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime)
+ )
# indicates that fulltext probably isn't english either
- if lang == 'en':
+ if lang == "en":
lang = None
# extra:
@@ -195,39 +216,43 @@ class ArxivRawImporter(EntityImporter):
# container_name
# group-title
# arxiv: comments, categories, etc
- extra_arxiv['base_id'] = base_id
- extra['superceded'] = True
- extra['arxiv'] = extra_arxiv
+ extra_arxiv["base_id"] = base_id
+ extra["superceded"] = True
+ extra["arxiv"] = extra_arxiv
versions = []
- for version in metadata.find_all('version'):
- arxiv_id = base_id + version['version']
+ for version in metadata.find_all("version"):
+ arxiv_id = base_id + version["version"]
release_date = version.date.string.strip()
- release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
+ release_date = datetime.datetime.strptime(
+ release_date, "%a, %d %b %Y %H:%M:%S %Z"
+ ).date()
# TODO: source_type?
- versions.append(fatcat_openapi_client.ReleaseEntity(
- work_id=None,
- title=title,
- #original_title
- version=version['version'],
- release_type=release_type,
- release_stage='submitted',
- release_date=release_date.isoformat(),
- release_year=release_date.year,
- ext_ids=fatcat_openapi_client.ReleaseExtIds(
- arxiv=arxiv_id,
- ),
- number=number,
- language=lang,
- license_slug=license_slug,
- abstracts=abstracts,
- contribs=contribs,
- extra=extra.copy(),
- ))
+ versions.append(
+ fatcat_openapi_client.ReleaseEntity(
+ work_id=None,
+ title=title,
+ # original_title
+ version=version["version"],
+ release_type=release_type,
+ release_stage="submitted",
+ release_date=release_date.isoformat(),
+ release_year=release_date.year,
+ ext_ids=fatcat_openapi_client.ReleaseExtIds(
+ arxiv=arxiv_id,
+ ),
+ number=number,
+ language=lang,
+ license_slug=license_slug,
+ abstracts=abstracts,
+ contribs=contribs,
+ extra=extra.copy(),
+ )
+ )
# TODO: assert that versions are actually in order?
assert versions
- versions[-1].extra.pop('superceded')
+ versions[-1].extra.pop("superceded")
# only apply DOI to most recent version (HACK)
if doi:
@@ -306,7 +331,7 @@ class ArxivRawImporter(EntityImporter):
for v in versions:
if v._existing_work_id:
if not v._updated:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
continue
if not any_work_id and last_edit:
# fetch the last inserted release from this group
@@ -315,7 +340,7 @@ class ArxivRawImporter(EntityImporter):
any_work_id = r.work_id
v.work_id = any_work_id
last_edit = self.api.create_release(self.get_editgroup_id(), v)
- self.counts['insert'] += 1
+ self.counts["insert"] += 1
return False
@@ -323,12 +348,15 @@ class ArxivRawImporter(EntityImporter):
# there is no batch/bezerk mode for arxiv importer, except for testing
if self._test_override:
for batch in batch_batch:
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
- self.counts['insert'] += len(batch) - 1
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
+ self.counts["insert"] += len(batch) - 1
else:
raise NotImplementedError()
@@ -341,9 +369,9 @@ class ArxivRawImporter(EntityImporter):
for article in soup.find_all("record"):
resp = self.parse_record(article)
print(json.dumps(resp))
- #sys.exit(-1)
+ # sys.exit(-1)
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = ArxivRawImporter(None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
index 0340f6a3..e9de42fc 100755
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -34,15 +34,15 @@ def single_file(prefix, path):
hashlib.sha1(),
hashlib.sha256(),
]
- with open(full, 'rb') as fp:
+ with open(full, "rb") as fp:
while True:
- data = fp.read(2**20)
+ data = fp.read(2 ** 20)
if not data:
break
for h in hashes:
h.update(data)
mime = magic.Magic(mime=True).from_file(full)
- if mime == 'application/octet-stream':
+ if mime == "application/octet-stream":
# magic apparently isn't that great; try using filename as well
guess = mimetypes.guess_type(full)[0]
if guess:
@@ -54,9 +54,11 @@ def single_file(prefix, path):
md5=hashes[0].hexdigest(),
sha1=hashes[1].hexdigest(),
sha256=hashes[2].hexdigest(),
- extra=dict(mimetype=mime))
+ extra=dict(mimetype=mime),
+ )
return fsf
+
def make_manifest(base_dir):
manifest = []
for root, dirs, files in os.walk(base_dir):
@@ -70,47 +72,49 @@ def cdl_dash_release(meta, extra=None):
if not extra:
extra = dict()
- assert meta['identifier']['type'] == 'DOI'
- doi = meta['identifier']['value'].lower()
- assert doi.startswith('10.')
+ assert meta["identifier"]["type"] == "DOI"
+ doi = meta["identifier"]["value"].lower()
+ assert doi.startswith("10.")
ark_id = None
- for extid in meta.get('alternativeIdentifiers', []):
- if extid['value'].startswith('ark:'):
- ark_id = extid['value']
+ for extid in meta.get("alternativeIdentifiers", []):
+ if extid["value"].startswith("ark:"):
+ ark_id = extid["value"]
assert ark_id
- license_slug = lookup_license_slug(meta['rights']['uri'])
+ license_slug = lookup_license_slug(meta["rights"]["uri"])
abstracts = []
- for desc in meta['descriptions']:
- if desc['type'] == "abstract":
- abstracts.append(ReleaseAbstract(
- mimetype="text/html",
- content=clean(desc['value'])))
- #print(abstracts)
+ for desc in meta["descriptions"]:
+ if desc["type"] == "abstract":
+ abstracts.append(
+ ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
+ )
+ # print(abstracts)
if not abstracts:
abstracts = None
contribs = []
- for creator in meta['creator']:
- contribs.append(ReleaseContrib(
- given_name=creator['given'],
- surname=creator['family'],
- # sorry everybody
- raw_name="{} {}".format(creator['given'], creator['family']),
- raw_affiliation=creator.get('affiliation'),
- role="author", # presumably, for these datasets?
- ))
+ for creator in meta["creator"]:
+ contribs.append(
+ ReleaseContrib(
+ given_name=creator["given"],
+ surname=creator["family"],
+ # sorry everybody
+ raw_name="{} {}".format(creator["given"], creator["family"]),
+ raw_affiliation=creator.get("affiliation"),
+ role="author", # presumably, for these datasets?
+ )
+ )
r = ReleaseEntity(
ext_ids=ReleaseExtIds(
doi=doi,
ark=ark_id,
),
- title=clean(meta['title'], force_xml=True),
- publisher=clean(meta['publisher']),
- release_year=int(meta['publicationYear']),
+ title=clean(meta["title"], force_xml=True),
+ publisher=clean(meta["publisher"]),
+ release_year=int(meta["publicationYear"]),
release_type="dataset",
license_slug=license_slug,
contribs=contribs,
@@ -119,66 +123,66 @@ def cdl_dash_release(meta, extra=None):
)
return r
+
def make_release_fileset(dat_path):
- if dat_path.endswith('/'):
+ if dat_path.endswith("/"):
dat_path = dat_path[:-1]
dat_discovery = dat_path
extra = dict()
assert len(dat_discovery) == 64
- with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp:
+ with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
meta_dict = json.loads(fp.read())
release = cdl_dash_release(meta_dict)
- ark_id = release.extra['ark_id']
+ ark_id = release.extra["ark_id"]
dash_version = None
# really crude XML parse-out
- with open(dat_path + "/stash-wrapper.xml", 'r') as fp:
+ with open(dat_path + "/stash-wrapper.xml", "r") as fp:
for line in fp:
line = line.strip()
if line.startswith("<st:version_number>"):
- dash_version = int(line[19:].split('<')[0])
+ dash_version = int(line[19:].split("<")[0])
assert dash_version is not None
- extra['cdl_dash'] = dict(version=dash_version)
- release.extra['cdl_dash'] = dict(version=dash_version)
+ extra["cdl_dash"] = dict(version=dash_version)
+ release.extra["cdl_dash"] = dict(version=dash_version)
manifest = make_manifest(dat_path + "/files/")
bundle_url = dict(
url="https://merritt.cdlib.org/u/{}/{}".format(
- urllib.parse.quote(ark_id, safe=''),
- dash_version),
- rel="repo-bundle")
+ urllib.parse.quote(ark_id, safe=""), dash_version
+ ),
+ rel="repo-bundle",
+ )
repo_url = dict(
url="https://merritt.cdlib.org/d/{}/{}/".format(
- urllib.parse.quote(ark_id, safe=''),
- dash_version),
- rel="repo")
- dat_url = dict(
- url="dat://{}/files/".format(dat_discovery),
- rel="dweb")
+ urllib.parse.quote(ark_id, safe=""), dash_version
+ ),
+ rel="repo",
+ )
+ dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
fs = FilesetEntity(
- urls=[bundle_url, repo_url, dat_url],
- release_ids=None,
- manifest=manifest,
- extra=extra)
+ urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
+ )
return (release, fs)
+
def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None):
- git_rev = subprocess.check_output(
- ["git", "describe", "--always"]).strip().decode('utf-8')
+ git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
(release, fileset) = make_release_fileset(dat_path)
if not editgroup_id:
- eg = api.create_editgroup(Editgroup(
- description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
- extra=dict(
- git_rev=git_rev,
- agent="fatcat_tools.auto_cdl_dash_dat")))
+ eg = api.create_editgroup(
+ Editgroup(
+ description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
+ extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
+ )
+ )
editgroup_id = eg.editgroup_id
if not release_id and release.ext_ids.doi:
@@ -201,6 +205,7 @@ def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None):
fileset = api.get_fileset(edit.ident)
return (editgroup_id, release, fileset)
-if __name__=='__main__':
+
+if __name__ == "__main__":
# pass this a discovery key that has been cloned to the local directory
print(make_release_fileset(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 0b634e73..8d2a89b6 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from .common import EntityImporter, clean
@@ -15,20 +14,19 @@ class ChoculaImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of container-level metadata from Chocula tool.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ChoculaImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of container-level metadata from Chocula tool.",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, raw_record):
- if not raw_record.get('ident') and not raw_record.get('_known_issnl'):
- self.counts['skip-unknown-new-issnl'] += 1
+ if not raw_record.get("ident") and not raw_record.get("_known_issnl"):
+ self.counts["skip-unknown-new-issnl"] += 1
return False
- if raw_record.get('issnl') and raw_record.get('name'):
+ if raw_record.get("issnl") and raw_record.get("name"):
return True
return False
@@ -39,42 +37,55 @@ class ChoculaImporter(EntityImporter):
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
- name = clean(row.get('name'))
+ name = clean(row.get("name"))
if not name:
# Name is required (by schema)
return None
name = name.strip()
- if name.endswith(', Proceedings of the'):
- name = "Proceedings of the " + name.split(',')[0]
+ if name.endswith(", Proceedings of the"):
+ name = "Proceedings of the " + name.split(",")[0]
- if name.endswith('.'):
+ if name.endswith("."):
name = name[:-1]
extra = dict()
- for k in ('urls', 'webarchive_urls', 'country',
- 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages',
- 'ia', 'scielo', 'kbart', 'publisher_type', 'platform'):
- if row['extra'].get(k):
- extra[k] = row['extra'][k]
+ for k in (
+ "urls",
+ "webarchive_urls",
+ "country",
+ "sherpa_romeo",
+ "ezb",
+ "szczepanski",
+ "doaj",
+ "languages",
+ "ia",
+ "scielo",
+ "kbart",
+ "publisher_type",
+ "platform",
+ ):
+ if row["extra"].get(k):
+ extra[k] = row["extra"][k]
container_type = None
- if 'proceedings' in name.lower():
- container_type = 'proceedings'
- elif 'journal ' in name.lower():
- container_type = 'journal'
+ if "proceedings" in name.lower():
+ container_type = "proceedings"
+ elif "journal " in name.lower():
+ container_type = "journal"
ce = fatcat_openapi_client.ContainerEntity(
- issnl=row['issnl'],
- issnp=row['extra'].get('issnp'),
- issne=row['extra'].get('issne'),
- ident=row['ident'],
+ issnl=row["issnl"],
+ issnp=row["extra"].get("issnp"),
+ issne=row["extra"].get("issne"),
+ ident=row["ident"],
name=name,
container_type=container_type,
- publisher=clean(row.get('publisher')),
- wikidata_qid=row.get('wikidata_qid'),
- extra=extra)
+ publisher=clean(row.get("publisher")),
+ wikidata_qid=row.get("wikidata_qid"),
+ extra=extra,
+ )
return ce
def try_update(self, ce):
@@ -86,12 +97,12 @@ class ChoculaImporter(EntityImporter):
except fatcat_openapi_client.rest.ApiException as err:
if err.status != 404:
raise err
- self.counts['exists'] += 1
- self.counts['exists-not-found'] += 1
+ self.counts["exists"] += 1
+ self.counts["exists-not-found"] += 1
return False
- if existing.state != 'active':
- self.counts['exists'] += 1
- self.counts['exists-inactive'] += 1
+ if existing.state != "active":
+ self.counts["exists"] += 1
+ self.counts["exists-inactive"] += 1
return False
if not existing:
@@ -102,8 +113,8 @@ class ChoculaImporter(EntityImporter):
if err.status != 404:
raise err
if existing:
- self.counts['exists'] += 1
- self.counts['exists-by-issnl'] += 1
+ self.counts["exists"] += 1
+ self.counts["exists-by-issnl"] += 1
return False
# doesn't exist, always create
return True
@@ -111,18 +122,22 @@ class ChoculaImporter(EntityImporter):
# decide whether to update
do_update = False
if not self.do_updates:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if not existing.extra:
existing.extra = dict()
- if ce.extra.get('urls') and set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
+ if ce.extra.get("urls") and set(ce.extra.get("urls", [])) != set(
+ existing.extra.get("urls", [])
+ ):
do_update = True
- if ce.extra.get('webarchive_urls') and set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])):
+ if ce.extra.get("webarchive_urls") and set(ce.extra.get("webarchive_urls", [])) != set(
+ existing.extra.get("webarchive_urls", [])
+ ):
do_update = True
- for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'):
+ for k in ("ezb", "szczepanski", "publisher_type", "platform"):
if ce.extra.get(k) and not existing.extra.get(k):
do_update = True
- for k in ('kbart', 'ia', 'doaj'):
+ for k in ("kbart", "ia", "doaj"):
# always update these fields if not equal (chocula override)
if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k):
do_update = True
@@ -137,41 +152,53 @@ class ChoculaImporter(EntityImporter):
existing.container_type = existing.container_type or ce.container_type
existing.issne = existing.issne or ce.issne
existing.issnp = existing.issnp or ce.issnp
- for k in ('urls', 'webarchive_urls'):
+ for k in ("urls", "webarchive_urls"):
# be conservative about URL updates; don't clobber existing URL lists
# may want to make this behavior more sophisticated in the
# future, or at least a config flag
if ce.extra.get(k) and not existing.extra.get(k):
existing.extra[k] = ce.extra.get(k, [])
- for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia',
- 'scielo', 'kbart', 'publisher_type', 'platform'):
+ for k in (
+ "sherpa_romeo",
+ "ezb",
+ "szczepanski",
+ "doaj",
+ "ia",
+ "scielo",
+ "kbart",
+ "publisher_type",
+ "platform",
+ ):
# always update (chocula over-rides)
if ce.extra.get(k):
existing.extra[k] = ce.extra[k]
- for k in ('country',):
+ for k in ("country",):
# only include if not set (don't clobber human edits)
if ce.extra.get(k) and not existing.extra.get(k):
existing.extra[k] = ce.extra[k]
- if ce.extra.get('languages'):
- if not existing.extra.get('languages'):
- existing.extra['languages'] = ce.extra['languages']
- elif not ce.extra['languages'][0] in existing.extra['languages']:
- existing.extra['languages'].append(ce.extra['languages'][0])
+ if ce.extra.get("languages"):
+ if not existing.extra.get("languages"):
+ existing.extra["languages"] = ce.extra["languages"]
+ elif not ce.extra["languages"][0] in existing.extra["languages"]:
+ existing.extra["languages"].append(ce.extra["languages"][0])
self.api.update_container(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
else:
- self.counts['exists'] += 1
- self.counts['exists-skip-update'] += 1
+ self.counts["exists"] += 1
+ self.counts["exists-skip-update"] += 1
return False
# if we got this far, it's a bug
raise NotImplementedError
def insert_batch(self, batch):
- self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_container_auto_batch(
+ fatcat_openapi_client.ContainerAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index e33a2012..2639c85a 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -1,4 +1,3 @@
-
import csv
import datetime
import json
@@ -34,7 +33,6 @@ SANE_MAX_URLS: int = 100
DOMAIN_REL_MAP: Dict[str, str] = {
"archive.org": "archive",
# LOCKSS, Portico, DuraSpace, etc would also be "archive"
-
"arxiv.org": "repository",
"babel.hathitrust.org": "repository",
"cds.cern.ch": "repository",
@@ -53,7 +51,6 @@ DOMAIN_REL_MAP: Dict[str, str] = {
"zenodo.org": "repository",
"www.biorxiv.org": "repository",
"www.medrxiv.org": "repository",
-
"citeseerx.ist.psu.edu": "aggregator",
"publisher-connector.core.ac.uk": "aggregator",
"core.ac.uk": "aggregator",
@@ -62,7 +59,6 @@ DOMAIN_REL_MAP: Dict[str, str] = {
"pdfs.semanticscholar.org": "aggregator",
"semanticscholar.org": "aggregator",
"www.semanticscholar.org": "aggregator",
-
"academic.oup.com": "publisher",
"cdn.elifesciences.org": "publisher",
"cell.com": "publisher",
@@ -86,15 +82,14 @@ DOMAIN_REL_MAP: Dict[str, str] = {
"ehp.niehs.nih.gov": "publisher",
"journals.tsu.ru": "publisher",
"www.cogentoa.com": "publisher",
-
"www.researchgate.net": "academicsocial",
"academia.edu": "academicsocial",
-
"wayback.archive-it.org": "webarchive",
"web.archive.org": "webarchive",
"archive.is": "webarchive",
}
+
def make_rel_url(raw_url: str, default_link_rel: str = "web"):
# this is where we map specific domains to rel types, and also filter out
# bad domains, invalid URLs, etc
@@ -105,12 +100,17 @@ def make_rel_url(raw_url: str, default_link_rel: str = "web"):
break
return (rel, raw_url)
+
def test_make_rel_url():
assert make_rel_url("http://example.com/thing.pdf")[0] == "web"
assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans"
- assert make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] == "webarchive"
+ assert (
+ make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0]
+ == "webarchive"
+ )
assert make_rel_url("http://cell.com/thing.pdf")[0] == "publisher"
+
class EntityImporter:
"""
Base class for fatcat entity importers.
@@ -147,23 +147,26 @@ class EntityImporter:
def __init__(self, api, **kwargs):
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['git_rev'] = eg_extra.get('git_rev',
- subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["git_rev"] = eg_extra.get(
+ "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip()
+ ).decode("utf-8")
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityImporter")
self.api = api
- self.do_updates = bool(kwargs.get('do_updates', True))
- self.do_fuzzy_match: bool = kwargs.get('do_fuzzy_match', True)
- self.bezerk_mode: bool = kwargs.get('bezerk_mode', False)
- self.submit_mode: bool = kwargs.get('submit_mode', False)
- self.edit_batch_size: int = kwargs.get('edit_batch_size', 100)
- self.editgroup_description: Optional[str] = kwargs.get('editgroup_description')
+ self.do_updates = bool(kwargs.get("do_updates", True))
+ self.do_fuzzy_match: bool = kwargs.get("do_fuzzy_match", True)
+ self.bezerk_mode: bool = kwargs.get("bezerk_mode", False)
+ self.submit_mode: bool = kwargs.get("submit_mode", False)
+ self.edit_batch_size: int = kwargs.get("edit_batch_size", 100)
+ self.editgroup_description: Optional[str] = kwargs.get("editgroup_description")
self.editgroup_extra: Optional[Any] = eg_extra
- self.es_client = kwargs.get('es_client')
+ self.es_client = kwargs.get("es_client")
if not self.es_client:
- self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120)
+ self.es_client = elasticsearch.Elasticsearch(
+ "https://search.fatcat.wiki", timeout=120
+ )
self._issnl_id_map: Dict[str, Any] = dict()
self._orcid_id_map: Dict[str, Any] = dict()
@@ -174,7 +177,7 @@ class EntityImporter:
self.reset()
def reset(self) -> None:
- self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+ self.counts = Counter({"total": 0, "skip": 0, "insert": 0, "update": 0, "exists": 0})
self._edit_count: int = 0
self._editgroup_id: Optional[str] = None
self._entity_queue: List[Any] = []
@@ -184,13 +187,13 @@ class EntityImporter:
"""
Returns nothing.
"""
- self.counts['total'] += 1
+ self.counts["total"] += 1
if (not raw_record) or (not self.want(raw_record)):
- self.counts['skip'] += 1
+ self.counts["skip"] += 1
return
entity = self.parse_record(raw_record)
if not entity:
- self.counts['skip'] += 1
+ self.counts["skip"] += 1
return
if self.bezerk_mode:
self.push_entity(entity)
@@ -230,7 +233,7 @@ class EntityImporter:
if self._entity_queue:
self.insert_batch(self._entity_queue)
- self.counts['insert'] += len(self._entity_queue)
+ self.counts["insert"] += len(self._entity_queue)
self._entity_queue = []
return self.counts
@@ -248,8 +251,9 @@ class EntityImporter:
if not self._editgroup_id:
eg = self.api.create_editgroup(
fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
self._editgroup_id = eg.editgroup_id
self._edit_count += edits
@@ -257,30 +261,30 @@ class EntityImporter:
def create_container(self, entity):
eg_id = self.get_editgroup_id()
- self.counts['inserted.container'] += 1
+ self.counts["inserted.container"] += 1
return self.api.create_container(eg_id, entity)
def create_release(self, entity):
eg_id = self.get_editgroup_id()
- self.counts['inserted.release'] += 1
+ self.counts["inserted.release"] += 1
return self.api.create_release(eg_id, entity)
def create_file(self, entity):
eg_id = self.get_editgroup_id()
- self.counts['inserted.file'] += 1
+ self.counts["inserted.file"] += 1
return self.api.create_file(eg_id, entity)
def updated(self):
"""
Implementations should call this from try_update() if the update was successful
"""
- self.counts['update'] += 1
+ self.counts["update"] += 1
def push_entity(self, entity):
self._entity_queue.append(entity)
if len(self._entity_queue) >= self.edit_batch_size:
self.insert_batch(self._entity_queue)
- self.counts['insert'] += len(self._entity_queue)
+ self.counts["insert"] += len(self._entity_queue)
self._entity_queue = []
def want(self, raw_record: Any) -> bool:
@@ -324,7 +328,7 @@ class EntityImporter:
# If anything other than a 404 (not found), something is wrong
if ae.status != 404:
raise ae
- self._orcid_id_map[orcid] = creator_id # might be None
+ self._orcid_id_map[orcid] = creator_id # might be None
return creator_id
def is_doi(self, doi: str) -> bool:
@@ -347,7 +351,7 @@ class EntityImporter:
# If anything other than a 404 (not found), something is wrong
if ae.status != 404:
raise ae
- self._doi_id_map[doi] = release_id # might be None
+ self._doi_id_map[doi] = release_id # might be None
return release_id
def lookup_pmid(self, pmid: str):
@@ -364,11 +368,11 @@ class EntityImporter:
# If anything other than a 404 (not found), something is wrong
if ae.status != 404:
raise ae
- self._pmid_id_map[pmid] = release_id # might be None
+ self._pmid_id_map[pmid] = release_id # might be None
return release_id
def is_issnl(self, issnl: str) -> bool:
- return len(issnl) == 9 and issnl[4] == '-'
+ return len(issnl) == 9 and issnl[4] == "-"
def lookup_issnl(self, issnl: str):
"""Caches calls to the ISSN-L lookup API endpoint in a local dict"""
@@ -382,7 +386,7 @@ class EntityImporter:
# If anything other than a 404 (not found), something is wrong
if ae.status != 404:
raise ae
- self._issnl_id_map[issnl] = container_id # might be None
+ self._issnl_id_map[issnl] = container_id # might be None
return container_id
def read_issn_map_file(self, issn_map_file):
@@ -417,26 +421,26 @@ class EntityImporter:
# update old/deprecated 'rel' on URLs
for i in range(len(existing.urls)):
u = existing.urls[i]
- if u.rel == 'repository' and '://archive.org/download/' in u.url:
- existing.urls[i].rel = 'archive'
- if u.rel == 'social':
- u.rel = 'academicsocial'
+ if u.rel == "repository" and "://archive.org/download/" in u.url:
+ existing.urls[i].rel = "archive"
+ if u.rel == "social":
+ u.rel = "academicsocial"
# remove URLs which are near-duplicates
redundant_urls = []
all_urls = [u.url for u in existing.urls]
- all_wayback_urls = [u.url for u in existing.urls if '://web.archive.org/web/' in u.url]
+ all_wayback_urls = [u.url for u in existing.urls if "://web.archive.org/web/" in u.url]
for url in all_urls:
# https/http redundancy
- if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls:
+ if url.startswith("http://") and url.replace("http://", "https://", 1) in all_urls:
redundant_urls.append(url)
continue
# default HTTP port included and not included
- if ':80/' in url and url.replace(':80', '', 1) in all_urls:
+ if ":80/" in url and url.replace(":80", "", 1) in all_urls:
redundant_urls.append(url)
continue
# partial and complete wayback timestamps
- if '://web.archive.org/web/2017/' in url:
+ if "://web.archive.org/web/2017/" in url:
original_url = "/".join(url.split("/")[5:])
assert len(original_url) > 5
for wb_url in all_wayback_urls:
@@ -452,7 +456,9 @@ class EntityImporter:
def generic_fileset_cleanups(existing):
return existing
- def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]:
+ def match_existing_release_fuzzy(
+ self, release: ReleaseEntity
+ ) -> Optional[Tuple[str, str, ReleaseEntity]]:
"""
This helper function uses fuzzycat (and elasticsearch) to look for
existing release entities with similar metadata.
@@ -488,7 +494,15 @@ class EntityImporter:
return None
release_dict = entity_to_dict(release, api_client=self.api.api_client)
- verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates]
+ verified = [
+ (
+ fuzzycat.verify.verify(
+ release_dict, entity_to_dict(c, api_client=self.api.api_client)
+ ),
+ c,
+ )
+ for c in candidates
+ ]
# chose the "closest" match
closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0]
@@ -522,7 +536,6 @@ class RecordPusher:
class JsonLinePusher(RecordPusher):
-
def __init__(self, importer, json_file, **kwargs):
self.importer = importer
self.json_file = json_file
@@ -539,10 +552,9 @@ class JsonLinePusher(RecordPusher):
class CsvPusher(RecordPusher):
-
def __init__(self, importer, csv_file, **kwargs):
self.importer = importer
- self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ','))
+ self.reader = csv.DictReader(csv_file, delimiter=kwargs.get("delimiter", ","))
def run(self):
for line in self.reader:
@@ -555,7 +567,6 @@ class CsvPusher(RecordPusher):
class LinePusher(RecordPusher):
-
def __init__(self, importer, text_file, **kwargs):
self.importer = importer
self.text_file = text_file
@@ -571,17 +582,15 @@ class LinePusher(RecordPusher):
class SqlitePusher(RecordPusher):
-
def __init__(self, importer, db_file, table_name, where_clause="", **kwargs):
self.importer = importer
- self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+ self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
self.db.row_factory = sqlite3.Row
self.table_name = table_name
self.where_clause = where_clause
def run(self):
- cur = self.db.execute("SELECT * FROM {} {};".format(
- self.table_name, self.where_clause))
+ cur = self.db.execute("SELECT * FROM {} {};".format(self.table_name, self.where_clause))
for row in cur:
self.importer.push_record(row)
counts = self.importer.finish()
@@ -590,7 +599,6 @@ class SqlitePusher(RecordPusher):
class Bs4XmlLinesPusher(RecordPusher):
-
def __init__(self, importer, xml_file, prefix_filter=None, **kwargs):
self.importer = importer
self.xml_file = xml_file
@@ -611,7 +619,6 @@ class Bs4XmlLinesPusher(RecordPusher):
class Bs4XmlFilePusher(RecordPusher):
-
def __init__(self, importer, xml_file, record_tag, **kwargs):
self.importer = importer
self.xml_file = xml_file
@@ -684,7 +691,6 @@ class Bs4XmlLargeFilePusher(RecordPusher):
class Bs4XmlFileListPusher(RecordPusher):
-
def __init__(self, importer, list_file, record_tag, **kwargs):
self.importer = importer
self.list_file = list_file
@@ -695,7 +701,7 @@ class Bs4XmlFileListPusher(RecordPusher):
xml_path = xml_path.strip()
if not xml_path or xml_path.startswith("#"):
continue
- with open(xml_path, 'r') as xml_file:
+ with open(xml_path, "r") as xml_file:
soup = BeautifulSoup(xml_file, "xml")
for record in soup.find_all(self.record_tag):
self.importer.push_record(record)
@@ -705,10 +711,12 @@ class Bs4XmlFileListPusher(RecordPusher):
print(counts)
return counts
+
class KafkaBs4XmlPusher(RecordPusher):
"""
Fetch XML for an article from Kafka, parse via Bs4.
"""
+
def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
self.importer = importer
self.consumer = make_kafka_consumer(
@@ -716,10 +724,10 @@ class KafkaBs4XmlPusher(RecordPusher):
kafka_env,
topic_suffix,
group,
- kafka_namespace=kwargs.get('kafka_namespace', 'fatcat')
+ kafka_namespace=kwargs.get("kafka_namespace", "fatcat"),
)
- self.poll_interval = kwargs.get('poll_interval', 5.0)
- self.consume_batch_size = kwargs.get('consume_batch_size', 25)
+ self.poll_interval = kwargs.get("poll_interval", 5.0)
+ self.consume_batch_size = kwargs.get("consume_batch_size", 25)
def run(self):
count = 0
@@ -735,16 +743,19 @@ class KafkaBs4XmlPusher(RecordPusher):
# outstanding editgroups every 5 minutes, but there is still that
# window when editgroups might be hanging (unsubmitted).
batch = self.consumer.consume(
- num_messages=self.consume_batch_size,
- timeout=self.poll_interval)
- print("... got {} kafka messages ({}sec poll interval) {}".format(
- len(batch), self.poll_interval, self.importer.counts))
+ num_messages=self.consume_batch_size, timeout=self.poll_interval
+ )
+ print(
+ "... got {} kafka messages ({}sec poll interval) {}".format(
+ len(batch), self.poll_interval, self.importer.counts
+ )
+ )
if not batch:
if datetime.datetime.now() - last_push > datetime.timedelta(minutes=5):
# it has been some time, so flush any current editgroup
self.importer.finish()
last_push = datetime.datetime.now()
- #print("Flushed any partial import batch: {}".format(self.importer.counts))
+ # print("Flushed any partial import batch: {}".format(self.importer.counts))
continue
# first check errors on entire batch...
for msg in batch:
@@ -752,7 +763,7 @@ class KafkaBs4XmlPusher(RecordPusher):
raise KafkaException(msg.error())
# ... then process
for msg in batch:
- soup = BeautifulSoup(msg.value().decode('utf-8'), "xml")
+ soup = BeautifulSoup(msg.value().decode("utf-8"), "xml")
self.importer.push_record(soup)
soup.decompose()
count += 1
@@ -771,8 +782,8 @@ class KafkaBs4XmlPusher(RecordPusher):
self.consumer.close()
return counts
-class KafkaJsonPusher(RecordPusher):
+class KafkaJsonPusher(RecordPusher):
def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
self.importer = importer
self.consumer = make_kafka_consumer(
@@ -780,11 +791,11 @@ class KafkaJsonPusher(RecordPusher):
kafka_env,
topic_suffix,
group,
- kafka_namespace=kwargs.get('kafka_namespace', 'fatcat')
+ kafka_namespace=kwargs.get("kafka_namespace", "fatcat"),
)
- self.poll_interval = kwargs.get('poll_interval', 5.0)
- self.consume_batch_size = kwargs.get('consume_batch_size', 100)
- self.force_flush = kwargs.get('force_flush', False)
+ self.poll_interval = kwargs.get("poll_interval", 5.0)
+ self.consume_batch_size = kwargs.get("consume_batch_size", 100)
+ self.force_flush = kwargs.get("force_flush", False)
def run(self):
count = 0
@@ -801,10 +812,13 @@ class KafkaJsonPusher(RecordPusher):
# outstanding editgroups every 5 minutes, but there is still that
# window when editgroups might be hanging (unsubmitted).
batch = self.consumer.consume(
- num_messages=self.consume_batch_size,
- timeout=self.poll_interval)
- print("... got {} kafka messages ({}sec poll interval) {}".format(
- len(batch), self.poll_interval, self.importer.counts))
+ num_messages=self.consume_batch_size, timeout=self.poll_interval
+ )
+ print(
+ "... got {} kafka messages ({}sec poll interval) {}".format(
+ len(batch), self.poll_interval, self.importer.counts
+ )
+ )
if self.force_flush:
# this flushing happens even if there have been 'push' events
# more recently. it is intended for, eg, importers off the
@@ -821,7 +835,7 @@ class KafkaJsonPusher(RecordPusher):
self.importer.finish()
last_push = datetime.datetime.now()
last_force_flush = datetime.datetime.now()
- #print("Flushed any partial import batch: {}".format(self.importer.counts))
+ # print("Flushed any partial import batch: {}".format(self.importer.counts))
continue
# first check errors on entire batch...
for msg in batch:
@@ -829,7 +843,7 @@ class KafkaJsonPusher(RecordPusher):
raise KafkaException(msg.error())
# ... then process
for msg in batch:
- record = json.loads(msg.value().decode('utf-8'))
+ record = json.loads(msg.value().decode("utf-8"))
self.importer.push_record(record)
count += 1
if count % 500 == 0:
@@ -864,25 +878,25 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat
print("Bailing out...")
# TODO: should it be sys.exit(-1)?
raise KafkaException(p.error)
- #print("Kafka consumer commit successful")
+ # print("Kafka consumer commit successful")
pass
# previously, using pykafka
- #auto_commit_enable=True,
- #auto_commit_interval_ms=30000, # 30 seconds
+ # auto_commit_enable=True,
+ # auto_commit_interval_ms=30000, # 30 seconds
conf = {
- 'bootstrap.servers': hosts,
- 'group.id': group,
- 'on_commit': fail_fast,
+ "bootstrap.servers": hosts,
+ "group.id": group,
+ "on_commit": fail_fast,
# messages don't have offset marked as stored until pushed to
# elastic, but we do auto-commit stored offsets to broker
- 'enable.auto.offset.store': False,
- 'enable.auto.commit': True,
+ "enable.auto.offset.store": False,
+ "enable.auto.commit": True,
# user code timeout; if no poll after this long, assume user code
# hung and rebalance (default: 5min)
- 'max.poll.interval.ms': 120000,
- 'default.topic.config': {
- 'auto.offset.reset': 'latest',
+ "max.poll.interval.ms": 120000,
+ "default.topic.config": {
+ "auto.offset.reset": "latest",
},
}
@@ -890,13 +904,13 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat
for p in partitions:
if p.error:
raise KafkaException(p.error)
- print("Kafka partitions rebalanced: {} / {}".format(
- consumer, partitions))
+ print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions))
consumer = Consumer(conf)
# NOTE: it's actually important that topic_name *not* be bytes (UTF-8
# encoded)
- consumer.subscribe([topic_name],
+ consumer.subscribe(
+ [topic_name],
on_assign=on_rebalance,
on_revoke=on_rebalance,
)
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index fd6936a4..606d4bb1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,4 +1,3 @@
-
import datetime
import sqlite3
from typing import Any, Dict, Optional
@@ -13,30 +12,30 @@ from .common import EntityImporter, clean
# Can get a list of Crossref types (with counts) via API:
# https://api.crossref.org/works?rows=0&facet=type-name:*
CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
- 'book': 'book',
- 'book-chapter': 'chapter',
- 'book-part': 'chapter',
- 'book-section': 'chapter',
- 'component': 'component',
- 'dataset': 'dataset',
- 'dissertation': 'thesis',
- 'edited-book': 'book',
- 'journal-article': 'article-journal',
- 'monograph': 'book',
- 'other': None,
- 'peer-review': 'peer_review',
- 'posted-content': 'post',
- 'proceedings-article': 'paper-conference',
- 'reference-book': 'book',
- 'reference-entry': 'entry',
- 'report': 'report',
- 'standard': 'standard',
+ "book": "book",
+ "book-chapter": "chapter",
+ "book-part": "chapter",
+ "book-section": "chapter",
+ "component": "component",
+ "dataset": "dataset",
+ "dissertation": "thesis",
+ "edited-book": "book",
+ "journal-article": "article-journal",
+ "monograph": "book",
+ "other": None,
+ "peer-review": "peer_review",
+ "posted-content": "post",
+ "proceedings-article": "paper-conference",
+ "reference-book": "book",
+ "reference-entry": "entry",
+ "report": "report",
+ "standard": "standard",
}
CONTAINER_TYPE_MAP: Dict[str, str] = {
- 'article-journal': 'journal',
- 'paper-conference': 'conference',
- 'book': 'book-series',
+ "article-journal": "journal",
+ "paper-conference": "conference",
+ "book": "book-series",
}
# These are based, informally, on sorting the most popular licenses found in
@@ -90,29 +89,41 @@ LICENSE_SLUG_MAP: Dict[str, str] = {
"//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
}
+
def lookup_license_slug(raw: str) -> Optional[str]:
if not raw:
return None
- raw = raw.strip().replace('http://', '//').replace('https://', '//')
- if 'creativecommons.org' in raw.lower():
+ raw = raw.strip().replace("http://", "//").replace("https://", "//")
+ if "creativecommons.org" in raw.lower():
raw = raw.lower()
- raw = raw.replace('/legalcode', '/').replace('/uk', '')
- if not raw.endswith('/'):
- raw = raw + '/'
+ raw = raw.replace("/legalcode", "/").replace("/uk", "")
+ if not raw.endswith("/"):
+ raw = raw + "/"
return LICENSE_SLUG_MAP.get(raw)
+
def test_lookup_license_slug():
assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
- assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY"
- assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0"
+ assert (
+ lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
+ == "CC-BY"
+ )
+ assert (
+ lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
+ == "CC-0"
+ )
assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
- assert lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA"
+ assert (
+ lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
+ == "CC-BY-NC-SA"
+ )
assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
assert lookup_license_slug("") is None
assert lookup_license_slug(None) is None
+
class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
@@ -124,18 +135,22 @@ class CrossrefImporter(EntityImporter):
def __init__(self, api, issn_map_file, **kwargs):
- eg_desc: Optional[str] = kwargs.get('editgroup_description',
- "Automated import of Crossref DOI metadata, harvested from REST API")
- eg_extra: Optional[dict] = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter')
- super().__init__(api,
+ eg_desc: Optional[str] = kwargs.get(
+ "editgroup_description",
+ "Automated import of Crossref DOI metadata, harvested from REST API",
+ )
+ eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
- self.create_containers: bool = kwargs.get('create_containers', True)
- extid_map_file = kwargs.get('extid_map_file')
+ self.create_containers: bool = kwargs.get("create_containers", True)
+ extid_map_file = kwargs.get("extid_map_file")
self.extid_map_db: Optional[Any] = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -148,12 +163,27 @@ class CrossrefImporter(EntityImporter):
def lookup_ext_ids(self, doi: str) -> Optional[Any]:
if self.extid_map_db is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
- [doi.lower()]).fetchone()
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = self.extid_map_db.execute(
+ "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
+ ).fetchone()
if row is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = [str(cell or "") or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
@@ -173,17 +203,17 @@ class CrossrefImporter(EntityImporter):
return CONTAINER_TYPE_MAP.get(crossref_type)
def want(self, obj: Dict[str, Any]) -> bool:
- if not obj.get('title'):
- self.counts['skip-blank-title'] += 1
+ if not obj.get("title"):
+ self.counts["skip-blank-title"] += 1
return False
# these are pre-registered DOIs before the actual record is ready
# title is a list of titles
- titles = obj.get('title')
+ titles = obj.get("title")
if titles is not None and titles[0].strip().lower() in [
- "OUP accepted manuscript".lower(),
- ]:
- self.counts['skip-stub-title'] += 1
+ "OUP accepted manuscript".lower(),
+ ]:
+ self.counts["skip-stub-title"] += 1
return False
# do most of these checks in-line below
@@ -197,86 +227,105 @@ class CrossrefImporter(EntityImporter):
# Ways to be out of scope (provisionally)
# journal-issue and journal-volume map to None, but allowed for now
- if obj.get('type') in (None, 'journal', 'proceedings',
- 'standard-series', 'report-series', 'book-series', 'book-set',
- 'book-track', 'proceedings-series'):
- self.counts['skip-release-type'] += 1
+ if obj.get("type") in (
+ None,
+ "journal",
+ "proceedings",
+ "standard-series",
+ "report-series",
+ "book-series",
+ "book-set",
+ "book-track",
+ "proceedings-series",
+ ):
+ self.counts["skip-release-type"] += 1
return None
# Do require the 'title' keys to exist, as release entities do
- if ('title' not in obj) or (not obj['title']):
- self.counts['skip-blank-title'] += 1
+ if ("title" not in obj) or (not obj["title"]):
+ self.counts["skip-blank-title"] += 1
return None
- release_type = self.map_release_type(obj['type'])
+ release_type = self.map_release_type(obj["type"])
# contribs
def do_contribs(obj_list, ctype):
contribs = []
for i, am in enumerate(obj_list):
creator_id = None
- if 'ORCID' in am.keys():
- creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
+ if "ORCID" in am.keys():
+ creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1])
# Sorry humans :(
- if am.get('given') and am.get('family'):
- raw_name = "{} {}".format(am['given'], am['family'])
- elif am.get('family'):
- raw_name = am['family']
+ if am.get("given") and am.get("family"):
+ raw_name = "{} {}".format(am["given"], am["family"])
+ elif am.get("family"):
+ raw_name = am["family"]
else:
# TODO: can end up empty
- raw_name = am.get('name') or am.get('given')
+ raw_name = am.get("name") or am.get("given")
extra = dict()
if ctype == "author":
index = i
else:
index = None
raw_affiliation = None
- if am.get('affiliation'):
- if len(am.get('affiliation')) > 0:
- raw_affiliation = am.get('affiliation')[0]['name']
- if len(am.get('affiliation')) > 1:
+ if am.get("affiliation"):
+ if len(am.get("affiliation")) > 0:
+ raw_affiliation = am.get("affiliation")[0]["name"]
+ if len(am.get("affiliation")) > 1:
# note: affiliation => more_affiliations
- extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
- if am.get('sequence') and am.get('sequence') != "additional":
- extra['seq'] = clean(am.get('sequence'))
+ extra["more_affiliations"] = [
+ clean(a["name"]) for a in am.get("affiliation")[1:]
+ ]
+ if am.get("sequence") and am.get("sequence") != "additional":
+ extra["seq"] = clean(am.get("sequence"))
if not extra:
extra = None
assert ctype in ("author", "editor", "translator")
raw_name = clean(raw_name)
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- creator_id=creator_id,
- index=index,
- raw_name=raw_name,
- given_name=clean(am.get('given')),
- surname=clean(am.get('family')),
- raw_affiliation=clean(raw_affiliation),
- role=ctype,
- extra=extra))
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ creator_id=creator_id,
+ index=index,
+ raw_name=raw_name,
+ given_name=clean(am.get("given")),
+ surname=clean(am.get("family")),
+ raw_affiliation=clean(raw_affiliation),
+ role=ctype,
+ extra=extra,
+ )
+ )
return contribs
- contribs = do_contribs(obj.get('author', []), "author")
- contribs.extend(do_contribs(obj.get('editor', []), "editor"))
- contribs.extend(do_contribs(obj.get('translator', []), "translator"))
+
+ contribs = do_contribs(obj.get("author", []), "author")
+ contribs.extend(do_contribs(obj.get("editor", []), "editor"))
+ contribs.extend(do_contribs(obj.get("translator", []), "translator"))
# container
- issn = obj.get('ISSN', [None])[0]
+ issn = obj.get("ISSN", [None])[0]
issnl = self.issn2issnl(issn)
container_id = None
if issnl:
container_id = self.lookup_issnl(issnl)
- publisher = clean(obj.get('publisher'))
+ publisher = clean(obj.get("publisher"))
- container_name = obj.get('container-title')
+ container_name = obj.get("container-title")
if container_name:
container_name = clean(container_name[0], force_xml=True)
if not container_name:
container_name = None
- if (container_id is None and self.create_containers and (issnl is not None)
- and container_name):
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and container_name
+ ):
ce = fatcat_openapi_client.ContainerEntity(
issnl=issnl,
publisher=publisher,
container_type=self.map_container_type(release_type),
- name=container_name)
+ name=container_name,
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
@@ -284,21 +333,21 @@ class CrossrefImporter(EntityImporter):
# license slug
license_slug = None
license_extra = []
- for lic in obj.get('license', []):
- if lic['content-version'] not in ('vor', 'unspecified'):
+ for lic in obj.get("license", []):
+ if lic["content-version"] not in ("vor", "unspecified"):
continue
- slug = lookup_license_slug(lic['URL'])
+ slug = lookup_license_slug(lic["URL"])
if slug:
license_slug = slug
- if 'start' in lic:
- lic['start'] = lic['start']['date-time']
+ if "start" in lic:
+ lic["start"] = lic["start"]["date-time"]
license_extra.append(lic)
# references
refs = []
- for i, rm in enumerate(obj.get('reference', [])):
+ for i, rm in enumerate(obj.get("reference", [])):
try:
- year: Optional[int] = int(rm.get('year'))
+ year: Optional[int] = int(rm.get("year"))
# TODO: will need to update/config in the future!
# NOTE: are there crossref works with year < 100?
if year is not None:
@@ -307,56 +356,78 @@ class CrossrefImporter(EntityImporter):
except (TypeError, ValueError):
year = None
ref_extra: Dict[str, Any] = dict()
- key = rm.get('key')
- if key and key.startswith(obj['DOI'].upper()):
- key = key.replace(obj['DOI'].upper() + "-", '')
- key = key.replace(obj['DOI'].upper(), '')
- ref_container_name = rm.get('volume-title')
+ key = rm.get("key")
+ if key and key.startswith(obj["DOI"].upper()):
+ key = key.replace(obj["DOI"].upper() + "-", "")
+ key = key.replace(obj["DOI"].upper(), "")
+ ref_container_name = rm.get("volume-title")
if not ref_container_name:
- ref_container_name = rm.get('journal-title')
- elif rm.get('journal-title'):
- ref_extra['journal-title'] = rm['journal-title']
- if rm.get('DOI'):
- ref_extra['doi'] = rm.get('DOI').lower()
- author = clean(rm.get('author'))
+ ref_container_name = rm.get("journal-title")
+ elif rm.get("journal-title"):
+ ref_extra["journal-title"] = rm["journal-title"]
+ if rm.get("DOI"):
+ ref_extra["doi"] = rm.get("DOI").lower()
+ author = clean(rm.get("author"))
if author:
- ref_extra['authors'] = [author]
- for k in ('editor', 'edition', 'authority', 'version', 'genre',
- 'url', 'event', 'issue', 'volume', 'date', 'accessed_date',
- 'issued', 'page', 'medium', 'collection_title', 'chapter_number',
- 'unstructured', 'series-title', 'volume-title'):
+ ref_extra["authors"] = [author]
+ for k in (
+ "editor",
+ "edition",
+ "authority",
+ "version",
+ "genre",
+ "url",
+ "event",
+ "issue",
+ "volume",
+ "date",
+ "accessed_date",
+ "issued",
+ "page",
+ "medium",
+ "collection_title",
+ "chapter_number",
+ "unstructured",
+ "series-title",
+ "volume-title",
+ ):
if clean(rm.get(k)):
ref_extra[k] = clean(rm[k])
if not ref_extra:
ref_extra = None
- refs.append(fatcat_openapi_client.ReleaseRef(
- index=i,
- # doing lookups would be a second import pass
- target_release_id=None,
- key=key,
- year=year,
- container_name=clean(ref_container_name),
- title=clean(rm.get('article-title')),
- locator=clean(rm.get('first-page')),
- # TODO: just dump JSON somewhere here?
- extra=ref_extra))
+ refs.append(
+ fatcat_openapi_client.ReleaseRef(
+ index=i,
+ # doing lookups would be a second import pass
+ target_release_id=None,
+ key=key,
+ year=year,
+ container_name=clean(ref_container_name),
+ title=clean(rm.get("article-title")),
+ locator=clean(rm.get("first-page")),
+ # TODO: just dump JSON somewhere here?
+ extra=ref_extra,
+ )
+ )
# abstracts
abstracts = []
- abstract = clean(obj.get('abstract'))
+ abstract = clean(obj.get("abstract"))
if abstract and len(abstract) > 10:
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(
- mimetype="application/xml+jats",
- content=abstract))
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(
+ mimetype="application/xml+jats", content=abstract
+ )
+ )
# extra fields
extra = dict()
extra_crossref = dict()
# top-level extra keys
if not container_id:
- if obj.get('container-title'):
- extra['container_name'] = container_name
- for key in ('group-title'):
+ if obj.get("container-title"):
+ extra["container_name"] = container_name
+ for key in "group-title":
val = obj.get(key)
if val:
if type(val) == list:
@@ -368,7 +439,7 @@ class CrossrefImporter(EntityImporter):
else:
extra[key] = val
# crossref-nested extra keys
- for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'):
+ for key in ("subject", "type", "alternative-id", "archive", "funder"):
val = obj.get(key)
if val:
if type(val) == str:
@@ -376,46 +447,51 @@ class CrossrefImporter(EntityImporter):
else:
extra_crossref[key] = val
if license_extra:
- extra_crossref['license'] = license_extra
+ extra_crossref["license"] = license_extra
- if len(obj['title']) > 1:
- aliases = [clean(t) for t in obj['title'][1:]]
+ if len(obj["title"]) > 1:
+ aliases = [clean(t) for t in obj["title"][1:]]
aliases = [t for t in aliases if t]
if aliases:
- extra['aliases'] = aliases
+ extra["aliases"] = aliases
# ISBN
isbn13 = None
- for raw in obj.get('ISBN', []):
+ for raw in obj.get("ISBN", []):
# TODO: convert if not ISBN-13 format
if len(raw) == 17:
isbn13 = raw
break
# release status
- if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
- 'dissertation', 'book-chapter'):
+ if obj["type"] in (
+ "journal-article",
+ "conference-proceeding",
+ "book",
+ "dissertation",
+ "book-chapter",
+ ):
release_stage = "published"
else:
# unknown
release_stage = None
# external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj['DOI'].lower())
+ extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower())
# filter out unreasonably huge releases
if len(abstracts) > 100:
- self.counts['skip-huge-abstracts'] += 1
+ self.counts["skip-huge-abstracts"] += 1
return None
if len(contribs) > 2000:
- self.counts['skip-huge-contribs'] += 1
+ self.counts["skip-huge-contribs"] += 1
return None
if len(refs) > 5000:
- self.counts['skip-huge-refs'] += 1
+ self.counts["skip-huge-refs"] += 1
return None
# release date parsing is amazingly complex
- raw_date = obj['issued']['date-parts'][0]
+ raw_date = obj["issued"]["date-parts"][0]
if not raw_date or not raw_date[0]:
# got some NoneType, even though at least year is supposed to be set
release_year = None
@@ -429,28 +505,28 @@ class CrossrefImporter(EntityImporter):
release_date = None
original_title: Optional[str] = None
- if obj.get('original-title'):
- ot = obj.get('original-title')
+ if obj.get("original-title"):
+ ot = obj.get("original-title")
if ot is not None:
original_title = clean(ot[0], force_xml=True)
title: Optional[str] = None
- if obj.get('title'):
- title = clean(obj.get('title')[0], force_xml=True)
+ if obj.get("title"):
+ title = clean(obj.get("title")[0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
- self.counts['skip-blank-title'] += 1
+ self.counts["skip-blank-title"] += 1
return None
subtitle = None
- if obj.get('subtitle'):
- subtitle = clean(obj.get('subtitle')[0], force_xml=True)
+ if obj.get("subtitle"):
+ subtitle = clean(obj.get("subtitle")[0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
subtitle = None
if extra_crossref:
- extra['crossref'] = extra_crossref
+ extra["crossref"] = extra_crossref
if not extra:
extra = None
@@ -466,19 +542,19 @@ class CrossrefImporter(EntityImporter):
release_year=release_year,
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
- doi=obj['DOI'].lower(),
- pmid=extids['pmid'],
- pmcid=extids['pmcid'],
- wikidata_qid=extids['wikidata_qid'],
+ doi=obj["DOI"].lower(),
+ pmid=extids["pmid"],
+ pmcid=extids["pmcid"],
+ wikidata_qid=extids["wikidata_qid"],
isbn13=isbn13,
- core=extids['core_id'],
- arxiv=extids['arxiv_id'],
- jstor=extids['jstor_id'],
+ core=extids["core_id"],
+ arxiv=extids["arxiv_id"],
+ jstor=extids["jstor_id"],
),
- volume=clean(obj.get('volume')),
- issue=clean(obj.get('issue')),
- pages=clean(obj.get('page')),
- language=clean(obj.get('language')),
+ volume=clean(obj.get("volume")),
+ issue=clean(obj.get("issue")),
+ pages=clean(obj.get("page")),
+ language=clean(obj.get("language")),
license_slug=license_slug,
extra=extra,
abstracts=abstracts,
@@ -500,14 +576,17 @@ class CrossrefImporter(EntityImporter):
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
if existing:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a06c68a4..4c174b0b 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -30,126 +30,130 @@ MAX_ABSTRACT_LENGTH = 2048
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
CONTAINER_TYPE_MAP = {
- 'Journal': 'journal',
- 'Series': 'journal',
- 'Book Series': 'book-series',
+ "Journal": "journal",
+ "Series": "journal",
+ "Book Series": "book-series",
}
# The docs/guide should be the canonical home for these mappings; update there
# first. Map various datacite type types to CSL-ish types. None means TODO or
# remove.
DATACITE_TYPE_MAP = {
- 'ris': {
- 'THES': 'thesis',
- 'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report)
- 'CHAP': 'chapter',
- 'FIGURE': 'figure',
- 'RPRT': 'report',
- 'JOUR': 'article-journal',
- 'MPCT': 'motion_picture',
- 'GEN': 'article-journal', # GEN consist of 99% article and report, post-weblog, misc - and one dataset
- 'BOOK': 'book',
- 'DATA': 'dataset',
- 'COMP': 'software',
+ "ris": {
+ "THES": "thesis",
+ "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report)
+ "CHAP": "chapter",
+ "FIGURE": "figure",
+ "RPRT": "report",
+ "JOUR": "article-journal",
+ "MPCT": "motion_picture",
+ "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset
+ "BOOK": "book",
+ "DATA": "dataset",
+ "COMP": "software",
},
- 'schemaOrg': {
- 'Dataset': 'dataset',
- 'Book': 'book',
- 'ScholarlyArticle': 'article-journal',
- 'ImageObject': 'graphic',
- 'Collection': None,
- 'MediaObject': None,
- 'Event': None,
- 'SoftwareSourceCode': 'software',
- 'Chapter': 'chapter',
- 'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
- 'PublicationIssue': 'article',
- 'AudioObject': None,
- 'Thesis': 'thesis',
+ "schemaOrg": {
+ "Dataset": "dataset",
+ "Book": "book",
+ "ScholarlyArticle": "article-journal",
+ "ImageObject": "graphic",
+ "Collection": None,
+ "MediaObject": None,
+ "Event": None,
+ "SoftwareSourceCode": "software",
+ "Chapter": "chapter",
+ "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
+ "PublicationIssue": "article",
+ "AudioObject": None,
+ "Thesis": "thesis",
},
- 'citeproc': {
- 'article': 'article',
- 'article-journal': 'article-journal',
- 'article-magazine': 'article-magazine',
- 'article-newspaper': 'article-newspaper',
- 'bill': 'bill',
- 'book': 'book',
- 'broadcast': 'broadcast',
- 'chapter': 'chapter',
- 'dataset': 'dataset',
- 'entry-dictionary': 'entry-dictionary',
- 'entry-encyclopedia': 'entry-encyclopedia',
- 'entry': 'entry',
- 'figure': 'figure',
- 'graphic': 'graphic',
- 'interview': 'interview',
- 'legal_case': 'legal_case',
- 'legislation': 'legislation',
- 'manuscript': 'manuscript',
- 'map': 'map',
- 'motion_picture': 'motion_picture',
- 'musical_score': 'musical_score',
- 'pamphlet': 'pamphlet',
- 'paper-conference': 'paper-conference',
- 'patent': 'patent',
- 'personal_communication': 'personal_communication',
- 'post': 'post',
- 'post-weblog': 'post-weblog',
- 'report': 'report',
- 'review-book': 'review-book',
- 'review': 'review',
- 'song': 'song',
- 'speech': 'speech',
- 'thesis': 'thesis',
- 'treaty': 'treaty',
- 'webpage': 'webpage',
+ "citeproc": {
+ "article": "article",
+ "article-journal": "article-journal",
+ "article-magazine": "article-magazine",
+ "article-newspaper": "article-newspaper",
+ "bill": "bill",
+ "book": "book",
+ "broadcast": "broadcast",
+ "chapter": "chapter",
+ "dataset": "dataset",
+ "entry-dictionary": "entry-dictionary",
+ "entry-encyclopedia": "entry-encyclopedia",
+ "entry": "entry",
+ "figure": "figure",
+ "graphic": "graphic",
+ "interview": "interview",
+ "legal_case": "legal_case",
+ "legislation": "legislation",
+ "manuscript": "manuscript",
+ "map": "map",
+ "motion_picture": "motion_picture",
+ "musical_score": "musical_score",
+ "pamphlet": "pamphlet",
+ "paper-conference": "paper-conference",
+ "patent": "patent",
+ "personal_communication": "personal_communication",
+ "post": "post",
+ "post-weblog": "post-weblog",
+ "report": "report",
+ "review-book": "review-book",
+ "review": "review",
+ "song": "song",
+ "speech": "speech",
+ "thesis": "thesis",
+ "treaty": "treaty",
+ "webpage": "webpage",
}, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
- 'bibtex': {
- 'phdthesis': 'thesis',
- 'inbook': 'chapter',
- 'misc': None,
- 'article': 'article-journal',
- 'book': 'book',
+ "bibtex": {
+ "phdthesis": "thesis",
+ "inbook": "chapter",
+ "misc": None,
+ "article": "article-journal",
+ "book": "book",
},
- 'resourceTypeGeneral': {
- 'Image': 'graphic',
- 'Dataset': 'dataset',
- 'PhysicalObject': None,
- 'Collection': None,
- 'Text': None, # "Greyliterature, labnotes, accompanyingmaterials"
- 'Sound': None,
- 'InteractiveResource': None,
- 'Event': None,
- 'Software': 'software',
- 'Other': None,
- 'Workflow': None,
- 'Audiovisual': None,
- } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
+ "resourceTypeGeneral": {
+ "Image": "graphic",
+ "Dataset": "dataset",
+ "PhysicalObject": None,
+ "Collection": None,
+ "Text": None, # "Greyliterature, labnotes, accompanyingmaterials"
+ "Sound": None,
+ "InteractiveResource": None,
+ "Event": None,
+ "Software": "software",
+ "Other": None,
+ "Workflow": None,
+ "Audiovisual": None,
+ }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
}
# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
DATACITE_UNKNOWN_MARKERS = (
- '(:unac)', # temporarily inaccessible
- '(:unal)', # unallowed, suppressed intentionally
- '(:unap)', # not applicable, makes no sense
- '(:unas)', # value unassigned (e.g., Untitled)
- '(:unav)', # value unavailable, possibly unknown
- '(:unkn)', # known to be unknown (e.g., Anonymous, Inconnue)
- '(:none)', # never had a value, never will
- '(:null)', # explicitly and meaningfully empty
- '(:tba)', # to be assigned or announced later
- '(:etal)', # too numerous to list (et alia)
+ "(:unac)", # temporarily inaccessible
+ "(:unal)", # unallowed, suppressed intentionally
+ "(:unap)", # not applicable, makes no sense
+ "(:unas)", # value unassigned (e.g., Untitled)
+ "(:unav)", # value unavailable, possibly unknown
+ "(:unkn)", # known to be unknown (e.g., Anonymous, Inconnue)
+ "(:none)", # never had a value, never will
+ "(:null)", # explicitly and meaningfully empty
+ "(:tba)", # to be assigned or announced later
+ "(:etal)", # too numerous to list (et alia)
)
# UNKNOWN_MARKERS joins official datacite markers with generic tokens marking
# unknown values.
-UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
- 'NA',
- 'NN',
- 'n.a.',
- '[s.n.]',
- 'Unknown',
-)))
+UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(
+ set(
+ (
+ "NA",
+ "NN",
+ "n.a.",
+ "[s.n.]",
+ "Unknown",
+ )
+ )
+)
# UNKNOWN_MARKERS_LOWER is the lowercase version of the UNKNOWN_MARKERS blocklist.
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
@@ -157,8 +161,20 @@ UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
# Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
DATACITE_TITLE_SPAM_WORDGROUPS = [
{
- "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online',
- 'free', 'hd', 'download', 'english', 'subtitle', 'bluray'),
+ "tokens": (
+ "full",
+ "movies",
+ "movie",
+ "watch",
+ "streaming",
+ "online",
+ "free",
+ "hd",
+ "download",
+ "english",
+ "subtitle",
+ "bluray",
+ ),
"min": 4,
}
]
@@ -205,28 +221,25 @@ class DataciteImporter(EntityImporter):
"""
Importer for datacite records.
"""
- def __init__(self,
- api,
- issn_map_file,
- debug=False,
- insert_log_file=None,
- **kwargs):
+
+ def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs):
eg_desc = kwargs.get(
- 'editgroup_description',
- "Automated import of Datacite DOI metadata, harvested from REST API"
+ "editgroup_description",
+ "Automated import of Datacite DOI metadata, harvested from REST API",
)
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent',
- 'fatcat_tools.DataciteImporter')
- super().__init__(api,
- issn_map_file=issn_map_file,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
-
- self.create_containers = kwargs.get('create_containers', True)
- extid_map_file = kwargs.get('extid_map_file')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DataciteImporter")
+ super().__init__(
+ api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs
+ )
+
+ self.create_containers = kwargs.get("create_containers", True)
+ extid_map_file = kwargs.get("extid_map_file")
self.extid_map_db = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -240,30 +253,34 @@ class DataciteImporter(EntityImporter):
self.insert_log_file = insert_log_file
self.this_year = datetime.datetime.now().year
- print('datacite with debug={}'.format(self.debug), file=sys.stderr)
+ print("datacite with debug={}".format(self.debug), file=sys.stderr)
def lookup_ext_ids(self, doi):
"""
Return dictionary of identifiers referring to the same things as the given DOI.
"""
if self.extid_map_db is None:
- return dict(core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None)
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
- [doi.lower()]).fetchone()
+ "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
+ ).fetchone()
if row is None:
- return dict(core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None)
- row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = [str(cell or "") or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
@@ -280,22 +297,22 @@ class DataciteImporter(EntityImporter):
"""
if not obj or not isinstance(obj, dict):
return None
- if 'attributes' not in obj:
+ if "attributes" not in obj:
return None
- attributes = obj['attributes']
- doi = clean_doi(attributes.get('doi', '').lower())
+ attributes = obj["attributes"]
+ doi = clean_doi(attributes.get("doi", "").lower())
if not doi:
- print('skipping record without a DOI', file=sys.stderr)
+ print("skipping record without a DOI", file=sys.stderr)
return
if not str.isascii(doi):
- print('[{}] skipping non-ascii doi for now'.format(doi))
+ print("[{}] skipping non-ascii doi for now".format(doi))
return None
- creators = attributes.get('creators', []) or []
- contributors = attributes.get('contributors', []) or [] # Much fewer than creators.
+ creators = attributes.get("creators", []) or []
+ contributors = attributes.get("contributors", []) or [] # Much fewer than creators.
contribs = self.parse_datacite_creators(creators, doi=doi)
@@ -323,7 +340,9 @@ class DataciteImporter(EntityImporter):
# Related: https://guide.fatcat.wiki/entity_release.html -- role
# (string, of a set): the type of contribution, from a controlled
# vocabulary. TODO: vocabulary needs review.
- contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi)
+ contribs_extra_contributors = self.parse_datacite_creators(
+ contributors, set_index=False, doi=doi
+ )
# Unfortunately, creators and contributors might overlap, refs GH59.
for cc in contribs_extra_contributors:
@@ -333,17 +352,16 @@ class DataciteImporter(EntityImporter):
# Title, may come with "attributes.titles[].titleType", like
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
- titles = attributes.get('titles', []) or []
- title, original_language_title, subtitle = parse_datacite_titles(
- titles)
+ titles = attributes.get("titles", []) or []
+ title, original_language_title, subtitle = parse_datacite_titles(titles)
if title is None:
- print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
+ print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
return False
title = clean(title)
if not title:
- print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
+ print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
return False
# check for blocklisted "spam", e.g. "FULL MOVIE"
@@ -367,10 +385,13 @@ class DataciteImporter(EntityImporter):
# "Collected", "Copyrighted", "Created", "Issued", "Submitted",
# "Updated", "Valid".
release_date, release_month, release_year = parse_datacite_dates(
- attributes.get('dates', []))
+ attributes.get("dates", [])
+ )
# block bogus far-future years/dates
- if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ if release_year is not None and (
+ release_year > (self.this_year + 5) or release_year < 1000
+ ):
release_date = None
release_month = None
release_year = None
@@ -378,26 +399,30 @@ class DataciteImporter(EntityImporter):
# Some records do not use the "dates" field (e.g. micropub), but:
# "attributes.published" or "attributes.publicationYear"
if not any((release_date, release_month, release_year)):
- release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear'))
+ release_date, release_month, release_year = parse_single_date(
+ attributes.get("publicationYear")
+ )
if not any((release_date, release_month, release_year)):
- release_date, release_month, release_year = parse_single_date(attributes.get('published'))
+ release_date, release_month, release_year = parse_single_date(
+ attributes.get("published")
+ )
if not any((release_date, release_month, release_year)):
- print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)
+ print("[{}] record w/o date: {}".format(doi, obj), file=sys.stderr)
# Start with clear stages, e.g. published. TODO(martin): we could
# probably infer a bit more from the relations, e.g.
# "IsPreviousVersionOf" or "IsNewVersionOf".
- release_stage = 'published'
+ release_stage = "published"
# TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,
# we might want something else than 'published'. See also:
# https://support.datacite.org/docs/doi-states.
# Publisher. A few NA values. A few bogus values.
- publisher = attributes.get('publisher')
+ publisher = attributes.get("publisher")
- if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')):
+ if publisher in UNKNOWN_MARKERS | set(("Unpublished", "Unknown")):
publisher = None
release_stage = None
if publisher is not None and len(publisher) > 80:
@@ -416,24 +441,26 @@ class DataciteImporter(EntityImporter):
container_id = None
container_name = None
- container = attributes.get('container', {}) or {}
- if container.get('type') in CONTAINER_TYPE_MAP.keys():
- container_type = CONTAINER_TYPE_MAP.get(container['type'])
- if container.get('identifier') and container.get(
- 'identifierType') == 'ISSN':
- issn = container.get('identifier')
+ container = attributes.get("container", {}) or {}
+ if container.get("type") in CONTAINER_TYPE_MAP.keys():
+ container_type = CONTAINER_TYPE_MAP.get(container["type"])
+ if container.get("identifier") and container.get("identifierType") == "ISSN":
+ issn = container.get("identifier")
if len(issn) == 8:
issn = issn[:4] + "-" + issn[4:]
issnl = self.issn2issnl(issn)
if issnl is not None:
container_id = self.lookup_issnl(issnl)
- if container_id is None and container.get('title'):
- container_name = container.get('title')
+ if container_id is None and container.get("title"):
+ container_name = container.get("title")
if isinstance(container_name, list):
if len(container_name) > 0:
- print('[{}] too many container titles: {}'.format(doi,
- len(container_name)))
+ print(
+ "[{}] too many container titles: {}".format(
+ doi, len(container_name)
+ )
+ )
container_name = container_name[0]
assert isinstance(container_name, str)
ce = fatcat_openapi_client.ContainerEntity(
@@ -447,21 +474,24 @@ class DataciteImporter(EntityImporter):
else:
# TODO(martin): factor this out into a testable function.
# TODO(martin): "container_name": "â„–1(1) (2018)" / 10.26087/inasan.2018.1.1.013
- container_name = container.get('title')
+ container_name = container.get("title")
if isinstance(container_name, list):
if len(container_name) > 0:
- print('[{}] too many container titles: {}'.format(doi,
- len(container_name)))
+ print(
+ "[{}] too many container titles: {}".format(
+ doi, len(container_name)
+ )
+ )
container_name = container_name[0]
# Exception: https://www.micropublication.org/, see: !MR24.
if container_id is None and container_name is None:
- if publisher and publisher.lower().startswith('micropublication'):
+ if publisher and publisher.lower().startswith("micropublication"):
container_name = publisher
# Volume and issue.
- volume = container.get('volume')
- issue = container.get('issue')
+ volume = container.get("volume")
+ issue = container.get("issue")
if volume:
volume = clean(volume)
@@ -472,13 +502,13 @@ class DataciteImporter(EntityImporter):
# Pages.
pages = None
- first_page = container.get('firstPage')
- last_page = container.get('lastPage')
+ first_page = container.get("firstPage")
+ last_page = container.get("lastPage")
if first_page and last_page:
try:
_ = int(first_page) < int(last_page)
- pages = '{}-{}'.format(first_page, last_page)
+ pages = "{}-{}".format(first_page, last_page)
except ValueError as err: # noqa: F841
# TODO(martin): This is more debug than info.
# print('[{}] {}'.format(doi, err), file=sys.stderr)
@@ -491,8 +521,8 @@ class DataciteImporter(EntityImporter):
license_slug = None
license_extra = []
- for lic in attributes.get('rightsList', []):
- slug = lookup_license_slug(lic.get('rightsUri'))
+ for lic in attributes.get("rightsList", []):
+ slug = lookup_license_slug(lic.get("rightsUri"))
if slug:
license_slug = slug
license_extra.append(lic)
@@ -506,7 +536,7 @@ class DataciteImporter(EntityImporter):
# library solves it for you." -- TODO(martin): We need more of these.
language = None
- value = attributes.get('language', '') or ''
+ value = attributes.get("language", "") or ""
try:
language = pycountry.languages.lookup(value).alpha_2
except (LookupError, AttributeError) as err: # noqa: F841
@@ -520,22 +550,22 @@ class DataciteImporter(EntityImporter):
# "Other" fields might contain references or related articles (with
# DOI). TODO(martin): maybe try to parse out some of those refs.
abstracts = []
- descs = attributes.get('descriptions', []) or []
+ descs = attributes.get("descriptions", []) or []
for desc in descs:
- if not desc.get('descriptionType') == 'Abstract':
+ if not desc.get("descriptionType") == "Abstract":
continue
# Description may be a string, int or list.
- text = desc.get('description', '')
+ text = desc.get("description", "")
if not text:
continue
if isinstance(text, int):
- text = '{}'.format(text)
+ text = "{}".format(text)
if isinstance(text, list):
try:
text = "\n".join(text)
except TypeError:
- continue # Bail out, if it is not a list of strings.
+ continue # Bail out, if it is not a list of strings.
# Limit length.
if len(text) < 10:
@@ -548,7 +578,10 @@ class DataciteImporter(EntityImporter):
try:
lang = langdetect.detect(text)
except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err:
- print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr)
+ print(
+ "[{}] language detection failed with {} on {}".format(doi, err, text),
+ file=sys.stderr,
+ )
abstract_text = clean(text)
if not abstract_text:
continue
@@ -557,7 +590,8 @@ class DataciteImporter(EntityImporter):
mimetype="text/plain",
content=abstract_text,
lang=lang,
- ))
+ )
+ )
# References and relations. Datacite includes many relation types in
# "attributes.relatedIdentifiers[].relationType", e.g.
@@ -570,67 +604,76 @@ class DataciteImporter(EntityImporter):
# For the moment, we only care about References.
refs, ref_index = [], 0
- relIds = attributes.get('relatedIdentifiers', []) or []
+ relIds = attributes.get("relatedIdentifiers", []) or []
for rel in relIds:
- if not rel.get('relationType', '') in ('References', 'Cites'):
+ if not rel.get("relationType", "") in ("References", "Cites"):
continue
ref_extra = dict()
- if rel.get('relatedIdentifierType', '') == 'DOI':
- ref_extra['doi'] = rel.get('relatedIdentifier')
+ if rel.get("relatedIdentifierType", "") == "DOI":
+ ref_extra["doi"] = rel.get("relatedIdentifier")
if not ref_extra:
ref_extra = None
refs.append(
fatcat_openapi_client.ReleaseRef(
index=ref_index,
extra=ref_extra,
- ))
+ )
+ )
ref_index += 1
# More specific release_type via 'Reviews' relationship.
for rel in relIds:
- if rel.get('relatedIdentifierType', '') != 'Reviews':
+ if rel.get("relatedIdentifierType", "") != "Reviews":
continue
- release_type = 'review'
+ release_type = "review"
# Extra information.
extra_datacite = dict()
if license_extra:
- extra_datacite['license'] = license_extra
- if attributes.get('subjects'):
- extra_datacite['subjects'] = attributes['subjects']
+ extra_datacite["license"] = license_extra
+ if attributes.get("subjects"):
+ extra_datacite["subjects"] = attributes["subjects"]
# Include version information.
- metadata_version = attributes.get('metadataVersion') or ''
+ metadata_version = attributes.get("metadataVersion") or ""
if metadata_version:
- extra_datacite['metadataVersion'] = metadata_version
+ extra_datacite["metadataVersion"] = metadata_version
# Include resource types.
- types = attributes.get('types', {}) or {}
- resource_type = types.get('resourceType', '') or ''
- resource_type_general = types.get('resourceTypeGeneral', '') or ''
+ types = attributes.get("types", {}) or {}
+ resource_type = types.get("resourceType", "") or ""
+ resource_type_general = types.get("resourceTypeGeneral", "") or ""
if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER:
- extra_datacite['resourceType'] = resource_type
+ extra_datacite["resourceType"] = resource_type
if resource_type_general and resource_type_general.lower() not in UNKNOWN_MARKERS_LOWER:
- extra_datacite['resourceTypeGeneral'] = resource_type_general
+ extra_datacite["resourceTypeGeneral"] = resource_type_general
# Include certain relations from relatedIdentifiers. Keeping the
# original structure of data here, which is a list of dicts, with
# relation type, identifier and identifier type (mostly).
relations = []
for rel in relIds:
- if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
- 'IsVariantFormOf', 'IsSupplementTo',
- 'HasVersion', 'IsMetadataFor',
- 'IsNewVersionOf', 'IsIdenticalTo',
- 'IsVersionOf', 'IsDerivedFrom',
- 'IsSourceOf'):
+ if rel.get("relationType") in (
+ "IsPartOf",
+ "Reviews",
+ "Continues",
+ "IsVariantFormOf",
+ "IsSupplementTo",
+ "HasVersion",
+ "IsMetadataFor",
+ "IsNewVersionOf",
+ "IsIdenticalTo",
+ "IsVersionOf",
+ "IsDerivedFrom",
+ "IsSourceOf",
+ ):
relations.append(rel)
if relations:
- extra_datacite['relations'] = relations
+ extra_datacite["relations"] = relations
extra = dict()
@@ -640,18 +683,18 @@ class DataciteImporter(EntityImporter):
# Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null,
# "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0",
# "10161", "10010691", "10780", # "Presentación"
- version = attributes.get('version') or None
+ version = attributes.get("version") or None
# top-level extra keys
if not container_id and container_name:
- extra['container_name'] = container_name
+ extra["container_name"] = container_name
# Always include datacite key, even if value is empty (dict).
- extra['datacite'] = extra_datacite
+ extra["datacite"] = extra_datacite
# Preparation for a schema update.
if release_month:
- extra['release_month'] = release_month
+ extra["release_month"] = release_month
extids = self.lookup_ext_ids(doi=doi)
@@ -669,12 +712,12 @@ class DataciteImporter(EntityImporter):
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids['pmid'],
- pmcid=extids['pmcid'],
- wikidata_qid=extids['wikidata_qid'],
- core=extids['core_id'],
- arxiv=extids['arxiv_id'],
- jstor=extids['jstor_id'],
+ pmid=extids["pmid"],
+ pmcid=extids["pmcid"],
+ wikidata_qid=extids["wikidata_qid"],
+ core=extids["core_id"],
+ arxiv=extids["arxiv_id"],
+ jstor=extids["jstor_id"],
),
contribs=contribs,
volume=volume,
@@ -702,19 +745,19 @@ class DataciteImporter(EntityImporter):
"""
release_type = None
- if not attributes.get('types'):
+ if not attributes.get("types"):
return None
- types = attributes['types']
+ types = attributes["types"]
- for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
+ for typeType in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"):
value = types.get(typeType)
release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
if release_type is not None:
break
# special case: figshare "collections" which group other entities
- if doi.startswith('10.6084/') or doi.startswith('10.25384'):
- if types.get('resourceType') == "Collection":
+ if doi.startswith("10.6084/") or doi.startswith("10.25384"):
+ if types.get("resourceType") == "Collection":
release_type = "stub"
if release_type is None:
@@ -736,35 +779,41 @@ class DataciteImporter(EntityImporter):
# publishes highly interesting datasets, but titles are mostly the same
# ("GBIF Occurrence Download" or "Occurrence Download"); set
# release_type to "stub" (CSL/FC).
- if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'):
- re.release_type = 'stub'
+ if re.title == "GBIF Occurrence Download" and re.ext_ids.doi.startswith("10.15468/dl."):
+ re.release_type = "stub"
# release_type exception: lots of "Experimental Crystal Structure Determination"
# publisher: "Cambridge Crystallographic Data Centre"
- if re.ext_ids.doi.startswith('10.5517/'):
- re.release_type = 'entry'
+ if re.ext_ids.doi.startswith("10.5517/"):
+ re.release_type = "entry"
# Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
- if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'):
- re.release_type = 'component'
+ if re.title.lower().startswith("additional file") and re.release_type in (
+ "article",
+ "article-journal",
+ ):
+ re.release_type = "component"
# figshare
- if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'):
+ if re.ext_ids.doi.startswith("10.6084/") or re.ext_ids.doi.startswith("10.25384"):
# set version if DOI ends with versioned suffix
- doi_suffix = re.ext_ids.doi.split('.')[-1]
- if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit():
+ doi_suffix = re.ext_ids.doi.split(".")[-1]
+ if doi_suffix and doi_suffix.startswith("v") and doi_suffix[1:].isdigit():
re.version = doi_suffix
# "Figure 123 from " -> component
# "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean"
- if " from " in re.title and re.release_type not in ('stub', 'graphic'):
+ if " from " in re.title and re.release_type not in ("stub", "graphic"):
if re.title.startswith("Figure "):
re.release_type = "component"
elif re.title.startswith("Table "):
re.release_type = "component"
# figshare.com
- if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None:
- re.extra['container_name'] = "figshare.com"
+ if (
+ re.ext_ids.doi.startswith("10.6084/m9.figshare.")
+ and re.extra.get("container_name") is None
+ ):
+ re.extra["container_name"] = "figshare.com"
return re
@@ -788,26 +837,28 @@ class DataciteImporter(EntityImporter):
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
if existing:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
return True
def insert_batch(self, batch):
- print('inserting batch ({})'.format(len(batch)), file=sys.stderr)
+ print("inserting batch ({})".format(len(batch)), file=sys.stderr)
if self.insert_log_file:
- with open(self.insert_log_file, 'a') as f:
+ with open(self.insert_log_file, "a") as f:
for doc in batch:
json.dump(entity_to_dict(doc, api_client=None), f)
- f.write('\n')
+ f.write("\n")
self.api.create_release_auto_batch(
fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
- def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
+ def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None):
"""
Parses a list of creators into a list of ReleaseContrib objects. Set
set_index to False, if the index contrib field should be left blank.
@@ -820,48 +871,53 @@ class DataciteImporter(EntityImporter):
contribs = []
# Names, that should be ignored right away.
- name_blocklist = set(('Occdownload Gbif.Org',))
+ name_blocklist = set(("Occdownload Gbif.Org",))
i = 0
for c in creators:
if not set_index:
i = None
- nameType = c.get('nameType', '') or ''
- if nameType in ('', 'Personal'):
+ nameType = c.get("nameType", "") or ""
+ if nameType in ("", "Personal"):
creator_id = None
- for nid in c.get('nameIdentifiers', []) or []:
+ for nid in c.get("nameIdentifiers", []) or []:
if not isinstance(nid, dict):
# see: fatcat-workers/issues/44035/
- print('unexpected nameIdentifiers, expected list of dicts, got: {}'.format(nid), file=sys.stderr)
+ print(
+ "unexpected nameIdentifiers, expected list of dicts, got: {}".format(
+ nid
+ ),
+ file=sys.stderr,
+ )
continue
- name_scheme = nid.get('nameIdentifierScheme', '') or ''
+ name_scheme = nid.get("nameIdentifierScheme", "") or ""
if not name_scheme.lower() == "orcid":
continue
- orcid = nid.get('nameIdentifier') or ''
- orcid = orcid.replace('https://orcid.org/', '')
+ orcid = nid.get("nameIdentifier") or ""
+ orcid = orcid.replace("https://orcid.org/", "")
if not orcid:
continue
creator_id = self.lookup_orcid(orcid)
# TODO(martin): If creator_id is None, should we create creators?
# If there are multiple affiliation strings, use the first one.
- affiliations = c.get('affiliation', []) or []
+ affiliations = c.get("affiliation", []) or []
raw_affiliation = None
if len(affiliations) == 0:
raw_affiliation = None
else:
raw_affiliation = clean(affiliations[0])
- name = c.get('name')
- given_name = c.get('givenName')
- surname = c.get('familyName')
+ name = c.get("name")
+ given_name = c.get("givenName")
+ surname = c.get("familyName")
if name:
name = clean(name)
if not any((name, given_name, surname)):
continue
if not name:
- name = "{} {}".format(given_name or '', surname or '').strip()
+ name = "{} {}".format(given_name or "", surname or "").strip()
if name in name_blocklist:
continue
if name.lower() in UNKNOWN_MARKERS_LOWER:
@@ -881,7 +937,7 @@ class DataciteImporter(EntityImporter):
if not name:
continue
- if raw_affiliation == '':
+ if raw_affiliation == "":
continue
extra = None
@@ -891,39 +947,38 @@ class DataciteImporter(EntityImporter):
# "RelatedPerson", "ProjectLeader", "Editor", "Other",
# "ProjectMember", "Funder", "RightsHolder", "DataCollector",
# "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
- contributorType = c.get('contributorType', '') or ''
+ contributorType = c.get("contributorType", "") or ""
if contributorType:
- extra = {'type': contributorType}
+ extra = {"type": contributorType}
rc = fatcat_openapi_client.ReleaseContrib(
- creator_id=creator_id,
- index=i,
- raw_name=name,
- given_name=given_name,
- surname=surname,
- role=role,
- raw_affiliation=raw_affiliation,
- extra=extra,
- )
+ creator_id=creator_id,
+ index=i,
+ raw_name=name,
+ given_name=given_name,
+ surname=surname,
+ role=role,
+ raw_affiliation=raw_affiliation,
+ extra=extra,
+ )
# Filter out duplicates early.
if not contributor_list_contains_contributor(contribs, rc):
contribs.append(rc)
if i is not None:
i += 1
- elif nameType == 'Organizational':
- name = c.get('name', '') or ''
+ elif nameType == "Organizational":
+ name = c.get("name", "") or ""
if name in UNKNOWN_MARKERS:
continue
if len(name) < 3:
continue
- extra = {'organization': name}
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- index=i, extra=extra))
+ extra = {"organization": name}
+ contribs.append(fatcat_openapi_client.ReleaseContrib(index=i, extra=extra))
if i is not None:
i += 1
else:
- print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+ print("[{}] unknown name type: {}".format(doi, nameType), file=sys.stderr)
return contribs
@@ -935,8 +990,8 @@ def contributor_list_contains_contributor(contributor_list, contributor):
for cc in contributor_list:
if cc.raw_name != contributor.raw_name:
continue
- cc_role = cc.role or 'author'
- contributor_role = contributor.role or 'author'
+ cc_role = cc.role or "author"
+ contributor_role = contributor.role or "author"
if cc_role != contributor_role:
continue
return True
@@ -952,91 +1007,97 @@ def lookup_license_slug(raw):
if not raw:
return None
- if 'creativecommons.org/publicdomain/zero' in raw:
- return 'CC-0'
- if raw.lower().endswith('/cc0'):
- return 'CC-0'
+ if "creativecommons.org/publicdomain/zero" in raw:
+ return "CC-0"
+ if raw.lower().endswith("/cc0"):
+ return "CC-0"
- if 'creativecommons' in raw:
+ if "creativecommons" in raw:
# https://creativecommons.org/publicdomain/mark/1.0/deed.de
- if 'creativecommons.org/publicdomain' in raw:
- return 'CC-PUBLICDOMAIN'
- if 'creativecommons.org/share-your-work/public-domain/cc0' in raw:
- return 'CC-0'
+ if "creativecommons.org/publicdomain" in raw:
+ return "CC-PUBLICDOMAIN"
+ if "creativecommons.org/share-your-work/public-domain/cc0" in raw:
+ return "CC-0"
# https://creativecommons.org/licenses/by/4.0/deed.es_ES
raw = raw.lower()
- match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE)
+ match = re.search(
+ r"creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)", raw, re.IGNORECASE
+ )
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
- if not name.startswith('cc'):
- name = 'cc-{}'.format(name)
+ if not name.startswith("cc"):
+ name = "cc-{}".format(name)
return name.upper()
- if 'opensource.org' in raw:
+ if "opensource.org" in raw:
# https://opensource.org/licenses/alphabetical, e.g. opensource.org/licenses/EUPL-1.2
- match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE)
+ match = re.search(r"opensource.org/licenses/(?P<name>[^/]+)", raw, re.IGNORECASE)
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
if len(name) > 11:
return None
return name.upper()
- if 'gnu.org' in raw:
+ if "gnu.org" in raw:
# http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
- match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE)
+ match = re.search(
+ r"/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)",
+ raw,
+ re.IGNORECASE,
+ )
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
if len(name) > 8:
return None
return name.upper()
- if 'spdx.org' in raw:
- if 'spdx.org/licenses/CC0' in raw:
- return 'CC-0'
+ if "spdx.org" in raw:
+ if "spdx.org/licenses/CC0" in raw:
+ return "CC-0"
# https://spdx.org/licenses/CC-BY-NC-ND-4.0.html
- match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE)
+ match = re.search(r"spdx.org/licenses/(?P<name>[a-z0-9-]+)", raw, re.IGNORECASE)
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
if len(name) > 36:
return None
# cleanup version and extensions
- name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower())
+ name = re.sub("(-[0-9])?[.]?[0-9]?(.json|.html)?", "", name.lower())
return name.upper()
- if 'rightsstatements.org' in raw:
+ if "rightsstatements.org" in raw:
# http://rightsstatements.org/vocab/InC/1.0/
- match = re.search(r'rightsstatements.org/(vocab|page)/(?P<name>[^/]*)', raw)
+ match = re.search(r"rightsstatements.org/(vocab|page)/(?P<name>[^/]*)", raw)
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
if len(name) > 9:
return None
- return 'RS-{}'.format(name.upper())
+ return "RS-{}".format(name.upper())
# Fallback to mapped values.
raw = raw.lower()
- raw = raw.strip().replace('http://', '//').replace('https://', '//')
- if not raw.endswith('/'):
- raw = raw + '/'
+ raw = raw.strip().replace("http://", "//").replace("https://", "//")
+ if not raw.endswith("/"):
+ raw = raw + "/"
return LICENSE_SLUG_MAP.get(raw)
@@ -1046,23 +1107,21 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3):
Example input: {'title': 'Some title', 'original_language_title': 'Some title'}
"""
- if 'original_language_title' not in item:
+ if "original_language_title" not in item:
return None
- title = item.get('title')
+ title = item.get("title")
if not title:
return None
- original_language_title = item.get('original_language_title')
- if isinstance(original_language_title,
- str) and title != original_language_title:
+ original_language_title = item.get("original_language_title")
+ if isinstance(original_language_title, str) and title != original_language_title:
if len(original_language_title) < min_length:
return None
- if original_language_title.count('?') > max_questionmarks:
+ if original_language_title.count("?") > max_questionmarks:
return None
return original_language_title
if isinstance(original_language_title, dict):
- content = original_language_title.get('__content__', '') or ''
- if content and content != title and not content.count(
- '?') > max_questionmarks:
+ content = original_language_title.get("__content__", "") or ""
+ if content and content != title and not content.count("?") > max_questionmarks:
return content
return None
@@ -1082,23 +1141,23 @@ def parse_datacite_titles(titles):
return title, original_language_title, subtitle
elif len(titles) == 1:
original_language_title = find_original_language_title(titles[0])
- title = titles[0].get('title', '') or ''
+ title = titles[0].get("title", "") or ""
title = title.strip()
if not title:
title = None
return title, original_language_title, subtitle
else:
for entry in titles:
- if not title and ('titleType' not in entry
- or not entry.get('titleType')):
- title = (entry.get('title') or '').strip()
- if not subtitle and entry.get('titleType') == 'Subtitle':
- subtitle = entry.get('title', '').strip()
+ if not title and ("titleType" not in entry or not entry.get("titleType")):
+ title = (entry.get("title") or "").strip()
+ if not subtitle and entry.get("titleType") == "Subtitle":
+ subtitle = entry.get("title", "").strip()
if not original_language_title:
original_language_title = find_original_language_title(entry)
return title, original_language_title, subtitle
+
def parse_single_date(value):
"""
Given a single string containing a date in arbitrary format, try to return
@@ -1113,11 +1172,11 @@ def parse_single_date(value):
# Results in a dict with keys: date_obj, period, locale.
parse_result = parser.get_date_data(value)
# A datetime object, later we need a date, only.
- result = parse_result['date_obj']
+ result = parse_result["date_obj"]
if result is not None:
- if parse_result['period'] == 'year':
+ if parse_result["period"] == "year":
return None, None, result.year
- elif parse_result['period'] == 'month':
+ elif parse_result["period"] == "month":
return None, result.month, result.year
else:
return result.date(), result.month, result.year
@@ -1126,6 +1185,7 @@ def parse_single_date(value):
return None, None, None
+
def parse_datacite_dates(dates):
"""
Given a list of date fields (under .dates), return tuple, (release_date,
@@ -1137,37 +1197,37 @@ def parse_datacite_dates(dates):
return release_date, release_month, release_year
if not isinstance(dates, list):
- raise ValueError('expected a list of date items')
+ raise ValueError("expected a list of date items")
# Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted",
# "Collected", "Updated", "Copyrighted", "Created"
# Ignored for now: "Collected", "Issued"
date_type_prio = (
- 'Valid',
- 'Available',
- 'Accepted',
- 'Submitted',
- 'Copyrighted',
- 'Created',
- 'Updated',
+ "Valid",
+ "Available",
+ "Accepted",
+ "Submitted",
+ "Copyrighted",
+ "Created",
+ "Updated",
)
# We need to note the granularity, since a string like "2019" would be
# parsed into "2019-01-01", even though the month is unknown. Use 3
# granularity types: 'y', 'm', 'd'.
- Pattern = collections.namedtuple('Pattern', 'layout granularity')
+ Pattern = collections.namedtuple("Pattern", "layout granularity")
# Before using (expensive) dateparser, try a few common patterns.
common_patterns = (
- Pattern('%Y-%m-%d', 'd'),
- Pattern('%Y-%m', 'm'),
- Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'),
- Pattern('%Y-%m-%dT%H:%M:%S', 'd'),
- Pattern('%Y', 'y'),
+ Pattern("%Y-%m-%d", "d"),
+ Pattern("%Y-%m", "m"),
+ Pattern("%Y-%m-%dT%H:%M:%SZ", "d"),
+ Pattern("%Y-%m-%dT%H:%M:%S", "d"),
+ Pattern("%Y", "y"),
)
def parse_item(item):
- result, value, year_only = None, str(item.get('date', '')) or '', False
+ result, value, year_only = None, str(item.get("date", "")) or "", False
release_date, release_month, release_year = None, None, None
for layout, granularity in common_patterns:
@@ -1176,22 +1236,22 @@ def parse_datacite_dates(dates):
except ValueError:
continue
else:
- if granularity == 'y':
+ if granularity == "y":
year_only = True
break
if result is None:
- print('fallback for {}'.format(value), file=sys.stderr)
+ print("fallback for {}".format(value), file=sys.stderr)
release_date, release_month, release_year = parse_single_date(value)
if result is None:
# Unparsable date.
return release_date, release_month, release_year
- if granularity != 'y':
+ if granularity != "y":
release_date = result.date()
release_year = result.year
- if granularity in ('m', 'd'):
+ if granularity in ("m", "d"):
release_month = result.month
return release_date, release_month, release_year
@@ -1200,7 +1260,7 @@ def parse_datacite_dates(dates):
for prio in date_type_prio:
for item in dates:
- if not item.get('dateType') == prio:
+ if not item.get("dateType") == prio:
continue
release_date, release_month, release_year = parse_item(item)
@@ -1224,45 +1284,49 @@ def parse_datacite_dates(dates):
return release_date, release_month, release_year
+
def index_form_to_display_name(s):
"""
Try to convert an index form name, like 'Razis, Panos A' into display_name,
e.g. 'Panos A Razis'.
"""
- if ',' not in s:
+ if "," not in s:
return s
- skip_on_chars = ['(', ')', '*']
+ skip_on_chars = ["(", ")", "*"]
for char in skip_on_chars:
if char in s:
return s
- if s.count(',') > 1:
+ if s.count(",") > 1:
# "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
return s
# Not names, but sprinkled in fields where authors live.
- stopwords = [s.lower() for s in (
- 'Archive',
- 'Collection',
- 'Coordinator',
- 'Department',
- 'Germany',
- 'International',
- 'National',
- 'Netherlands',
- 'Office',
- 'Organisation',
- 'Organization',
- 'Service',
- 'Services',
- 'United States',
- 'University',
- 'Verein',
- 'Volkshochschule',
- )]
+ stopwords = [
+ s.lower()
+ for s in (
+ "Archive",
+ "Collection",
+ "Coordinator",
+ "Department",
+ "Germany",
+ "International",
+ "National",
+ "Netherlands",
+ "Office",
+ "Organisation",
+ "Organization",
+ "Service",
+ "Services",
+ "United States",
+ "University",
+ "Verein",
+ "Volkshochschule",
+ )
+ ]
lower = s.lower()
for stop in stopwords:
if stop in lower:
return s
- a, b = s.split(',')
- return '{} {}'.format(b.strip(), a.strip())
+ a, b = s.split(",")
+ return "{} {}".format(b.strip(), a.strip())
diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py
index 3d280fb7..603a6271 100644
--- a/python/fatcat_tools/importers/dblp_container.py
+++ b/python/fatcat_tools/importers/dblp_container.py
@@ -1,4 +1,3 @@
-
"""
Importer for DBLP container-level (journal/conference/series) metadata,
pre-scraped in to JSON from HTML pages.
@@ -13,17 +12,17 @@ from fatcat_tools.normal import clean_str
class DblpContainerImporter(EntityImporter):
+ def __init__(
+ self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs
+ ):
- def __init__(self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs):
-
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of container-level metadata scraped from dblp HTML")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DblpContainerImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of container-level metadata scraped from dblp HTML",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpContainerImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.dblp_container_map_output = dblp_container_map_output
self.read_dblp_container_map_file(dblp_container_map_file)
@@ -40,7 +39,10 @@ class DblpContainerImporter(EntityImporter):
assert len(container_id) == 26
self._dblp_container_map[prefix] = container_id
print("\t".join([prefix, container_id]), file=self.dblp_container_map_output)
- print("Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+ print(
+ "Got {} existing dblp container mappings.".format(len(self._dblp_container_map)),
+ file=sys.stderr,
+ )
def lookup_dblp_prefix(self, prefix):
if not prefix:
@@ -57,48 +59,48 @@ class DblpContainerImporter(EntityImporter):
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
- dblp_prefix = row.get('key') or row.get('dblp_prefix')
+ dblp_prefix = row.get("key") or row.get("dblp_prefix")
assert dblp_prefix
- assert row['title']
+ assert row["title"]
container_type = None
- if dblp_prefix.startswith('conf/'):
+ if dblp_prefix.startswith("conf/"):
container_type = "conference-series"
- elif dblp_prefix.startswith('journals/'):
+ elif dblp_prefix.startswith("journals/"):
container_type = "journal"
- elif dblp_prefix.startswith('series/'):
+ elif dblp_prefix.startswith("series/"):
container_type = "book-series"
issnl = None
- for issn in row.get('issns', []):
+ for issn in row.get("issns", []):
issnl = self.issn2issnl(issn)
if issnl:
break
extra = {
- 'dblp': {
- 'prefix': dblp_prefix,
+ "dblp": {
+ "prefix": dblp_prefix,
},
}
- if row.get('homepage_url'):
- extra['urls'] = [row['homepage_url']]
+ if row.get("homepage_url"):
+ extra["urls"] = [row["homepage_url"]]
- if row.get('acronym'):
- extra['acronym'] = row['acronym']
+ if row.get("acronym"):
+ extra["acronym"] = row["acronym"]
ce = fatcat_openapi_client.ContainerEntity(
- name=clean_str(row['title']),
+ name=clean_str(row["title"]),
container_type=container_type,
issnl=issnl,
- wikidata_qid=row.get('wikidata_qid'),
+ wikidata_qid=row.get("wikidata_qid"),
extra=extra,
)
return ce
def try_update(self, ce):
- dblp_prefix = ce.extra['dblp']['prefix']
+ dblp_prefix = ce.extra["dblp"]["prefix"]
existing = None
existing_container_id = self.lookup_dblp_prefix(dblp_prefix)
if existing_container_id:
@@ -123,8 +125,11 @@ class DblpContainerImporter(EntityImporter):
return True
if existing:
- self.counts['exists'] += 1
- print("\t".join([ce.extra['dblp']['prefix'], existing.ident]), file=self.dblp_container_map_output)
+ self.counts["exists"] += 1
+ print(
+ "\t".join([ce.extra["dblp"]["prefix"], existing.ident]),
+ file=self.dblp_container_map_output,
+ )
return False
# shouldn't get here
@@ -135,11 +140,17 @@ class DblpContainerImporter(EntityImporter):
Because we want to print a prefix/container_id match for each row, we
require a special batch insert method
"""
- eg = self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ eg = self.api.create_container_auto_batch(
+ fatcat_openapi_client.ContainerAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
for c_edit in eg.edits.containers:
c = self.api.get_container(c_edit.ident)
- print("\t".join([c.extra['dblp']['prefix'], c.ident]), file=self.dblp_container_map_output)
+ print(
+ "\t".join([c.extra["dblp"]["prefix"], c.ident]),
+ file=self.dblp_container_map_output,
+ )
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 6d028f2f..5baa6cd6 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -1,4 +1,3 @@
-
"""
Importer for DBLP release-level (article/paper/etc) XML metadata.
@@ -44,25 +43,16 @@ from fatcat_tools.transforms import entity_to_dict
class DblpReleaseImporter(EntityImporter):
-
- def __init__(self,
- api,
- dblp_container_map_file=None,
- **kwargs):
+ def __init__(self, api, dblp_container_map_file=None, **kwargs):
eg_desc = kwargs.get(
- 'editgroup_description',
- "Automated import of dblp metadata via XML records"
+ "editgroup_description", "Automated import of dblp metadata via XML records"
)
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent',
- 'fatcat_tools.DblpReleaseImporter')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpReleaseImporter")
# ensure default is to not do updates with this worker (override super() default)
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.dump_json_mode = kwargs.get("dump_json_mode", False)
self.this_year = datetime.datetime.now().year
@@ -76,13 +66,16 @@ class DblpReleaseImporter(EntityImporter):
"phdthesis",
"mastersthesis",
"www",
- #"data", # no instances in 2020-11 dump
+ # "data", # no instances in 2020-11 dump
]
def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
self._dblp_container_map = dict()
if not dblp_container_map_file:
- print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr)
+ print(
+ "Not loading a dblp prefix container map file; entities will fail to import",
+ file=sys.stderr,
+ )
return
print("Loading dblp prefix container map file...", file=sys.stderr)
for line in dblp_container_map_file:
@@ -92,7 +85,10 @@ class DblpReleaseImporter(EntityImporter):
container_id = container_id.strip()
assert len(container_id) == 26
self._dblp_container_map[prefix] = container_id
- print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+ print(
+ "Got {} dblp container mappings.".format(len(self._dblp_container_map)),
+ file=sys.stderr,
+ )
def lookup_dblp_prefix(self, prefix):
if not prefix:
@@ -101,13 +97,13 @@ class DblpReleaseImporter(EntityImporter):
def want(self, xml_elem):
if xml_elem.name not in self.ELEMENT_TYPES:
- self.counts['skip-type'] += 1
+ self.counts["skip-type"] += 1
return False
- if not xml_elem.get('key'):
- self.counts['skip-no-key'] += 1
+ if not xml_elem.get("key"):
+ self.counts["skip-no-key"] += 1
return False
- if xml_elem['key'].startswith('homepage/'):
- self.counts['skip-type-homepage'] += 1
+ if xml_elem["key"].startswith("homepage/"):
+ self.counts["skip-type-homepage"] += 1
return False
return True
@@ -127,88 +123,88 @@ class DblpReleaseImporter(EntityImporter):
- isbn
"""
- dblp_key = xml_elem.get('key')
+ dblp_key = xml_elem.get("key")
if not dblp_key:
- self.counts['skip-empty-key'] += 1
+ self.counts["skip-empty-key"] += 1
return False
- dblp_key_type = dblp_key.split('/')[0]
+ dblp_key_type = dblp_key.split("/")[0]
# dblp_prefix may be used for container lookup
dblp_prefix = None
- if dblp_key_type in ('journals', 'conf'):
- dblp_prefix = '/'.join(dblp_key.split('/')[:2])
- elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
- dblp_prefix = '/'.join(dblp_key.split('/')[:-1])
+ if dblp_key_type in ("journals", "conf"):
+ dblp_prefix = "/".join(dblp_key.split("/")[:2])
+ elif dblp_key_type in ("series", "reference", "tr", "books"):
+ dblp_prefix = "/".join(dblp_key.split("/")[:-1])
- publtype = xml_elem.get('publtype') or None
+ publtype = xml_elem.get("publtype") or None
dblp_type = xml_elem.name
if dblp_type not in self.ELEMENT_TYPES:
- self.counts[f'skip-dblp-type:{dblp_type}'] += 1
+ self.counts[f"skip-dblp-type:{dblp_type}"] += 1
- if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
- self.counts['skip-key-type'] += 1
+ if dblp_key_type in ("homepages", "persons", "dblpnote"):
+ self.counts["skip-key-type"] += 1
return False
- if dblp_key.startswith('journals/corr/'):
- self.counts['skip-arxiv-corr'] += 1
+ if dblp_key.startswith("journals/corr/"):
+ self.counts["skip-arxiv-corr"] += 1
return False
title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
if not title:
- self.counts['skip-title'] += 1
+ self.counts["skip-title"] += 1
return False
- if title.endswith('.'):
+ if title.endswith("."):
title = title[:-1]
release_type = None
- release_stage = 'published'
+ release_stage = "published"
withdrawn_status = None
# primary releae_type detection: type of XML element, then prefix of key for granularity
- if dblp_type == 'article':
- release_type = 'article'
- if dblp_key_type == 'journals' and publtype != 'informal':
- release_type = 'article-journal'
- elif dblp_key_type == 'tr':
- release_type = 'report'
+ if dblp_type == "article":
+ release_type = "article"
+ if dblp_key_type == "journals" and publtype != "informal":
+ release_type = "article-journal"
+ elif dblp_key_type == "tr":
+ release_type = "report"
elif title.startswith("Review:"):
- release_type = 'review'
- elif dblp_type == 'inproceedings':
- release_type = 'paper-conference'
- elif dblp_type == 'book':
- release_type = 'book'
- elif dblp_type == 'incollection':
+ release_type = "review"
+ elif dblp_type == "inproceedings":
+ release_type = "paper-conference"
+ elif dblp_type == "book":
+ release_type = "book"
+ elif dblp_type == "incollection":
# XXX: part vs. chapter?
- release_type = 'chapter'
- elif dblp_type == 'data':
- release_type = 'dataset'
- elif dblp_type in ('mastersthesis', 'phdthesis'):
- release_type = 'thesis'
+ release_type = "chapter"
+ elif dblp_type == "data":
+ release_type = "dataset"
+ elif dblp_type in ("mastersthesis", "phdthesis"):
+ release_type = "thesis"
# overrides/extensions of the above
- if publtype == 'informal':
+ if publtype == "informal":
# for conferences, seems to indicate peer-review status
# for journals, seems to indicate things like book reviews; split out above
pass
- elif publtype == 'encyclopedia':
- release_type = 'entry-encyclopedia'
- elif publtype == 'edited':
+ elif publtype == "encyclopedia":
+ release_type = "entry-encyclopedia"
+ elif publtype == "edited":
# XXX: article?
- release_type = 'editorial'
- elif publtype == 'data':
- release_type = 'dataset'
- elif publtype == 'data':
- release_type = 'dataset'
- elif publtype == 'software':
- release_type = 'software'
- elif publtype == 'widthdrawn':
- withdrawn_status = 'widthdrawn'
- elif publtype == 'survey':
+ release_type = "editorial"
+ elif publtype == "data":
+ release_type = "dataset"
+ elif publtype == "data":
+ release_type = "dataset"
+ elif publtype == "software":
+ release_type = "software"
+ elif publtype == "widthdrawn":
+ withdrawn_status = "widthdrawn"
+ elif publtype == "survey":
# XXX: flag as a review/survey article?
pass
- #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
+ # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
container_name = None
booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
@@ -236,7 +232,9 @@ class DblpReleaseImporter(EntityImporter):
part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)
# block bogus far-future years/dates
- if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ if release_year is not None and (
+ release_year > (self.this_year + 5) or release_year < 1000
+ ):
release_month = None
release_year = None
@@ -245,39 +243,39 @@ class DblpReleaseImporter(EntityImporter):
if isbn:
ext_ids.isbn13 = isbn
if ext_ids.doi:
- self.counts['has-doi'] += 1
+ self.counts["has-doi"] += 1
# dblp-specific extra
dblp_extra = dict(type=dblp_type)
note = clean_str(xml_elem.note and xml_elem.note.text)
- if note and 'base-search.net' not in note:
- dblp_extra['note'] = note
+ if note and "base-search.net" not in note:
+ dblp_extra["note"] = note
if part_of_key:
- dblp_extra['part_of_key'] = part_of_key
+ dblp_extra["part_of_key"] = part_of_key
# generic extra
extra = dict()
if not container_id and container_name:
- extra['container_name'] = container_name
+ extra["container_name"] = container_name
- if series and (dblp_key_type == 'series' or dblp_type == 'book'):
- extra['series-title'] = series
+ if series and (dblp_key_type == "series" or dblp_type == "book"):
+ extra["series-title"] = series
elif series:
- dblp_extra['series'] = series
+ dblp_extra["series"] = series
- if booktitle and dblp_key_type == 'series':
- extra['container-title'] = booktitle
- elif booktitle and dblp_key_type == 'conf':
- extra['event'] = booktitle
+ if booktitle and dblp_key_type == "series":
+ extra["container-title"] = booktitle
+ elif booktitle and dblp_key_type == "conf":
+ extra["event"] = booktitle
elif booktitle:
- dblp_extra['booktitle'] = booktitle
+ dblp_extra["booktitle"] = booktitle
if release_year and release_month:
# TODO: release_month schema migration
- extra['release_month'] = release_month
+ extra["release_month"] = release_month
if dblp_extra:
- extra['dblp'] = dblp_extra
+ extra["dblp"] = dblp_extra
if not extra:
extra = None
@@ -289,7 +287,7 @@ class DblpReleaseImporter(EntityImporter):
withdrawn_status=withdrawn_status,
title=title,
release_year=release_year,
- #release_date,
+ # release_date,
publisher=publisher,
ext_ids=ext_ids,
contribs=contribs,
@@ -302,8 +300,8 @@ class DblpReleaseImporter(EntityImporter):
if self.dump_json_mode:
re_dict = entity_to_dict(re, api_client=self.api.api_client)
- re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem)
- re_dict['_dblp_prefix'] = dblp_prefix
+ re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem)
+ re_dict["_dblp_prefix"] = dblp_prefix
print(json.dumps(re_dict, sort_keys=True))
return False
@@ -341,11 +339,11 @@ class DblpReleaseImporter(EntityImporter):
# then try other ext_id lookups
if not existing:
- for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
+ for extid_type in ("doi", "wikidata_qid", "isbn13", "arxiv"):
extid_val = getattr(re.ext_ids, extid_type)
if not extid_val:
continue
- #print(f" lookup release type: {extid_type} val: {extid_val}")
+ # print(f" lookup release type: {extid_type} val: {extid_val}")
try:
existing = self.api.lookup_release(**{extid_type: extid_val})
except fatcat_openapi_client.rest.ApiException as err:
@@ -373,12 +371,14 @@ class DblpReleaseImporter(EntityImporter):
return True
if not self.do_updates or existing.ext_ids.dblp:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# logic for whether to do update or skip
- if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv:
- self.counts['skip-update'] += 1
+ if (
+ existing.container_id and existing.release_type and existing.release_stage
+ ) or existing.ext_ids.arxiv:
+ self.counts["skip-update"] += 1
return False
# fields to copy over for update
@@ -390,20 +390,20 @@ class DblpReleaseImporter(EntityImporter):
existing.release_stage = existing.release_stage or re.release_stage
existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
existing.container_id = existing.container_id or re.container_id
- existing.extra['dblp'] = re.extra['dblp']
+ existing.extra["dblp"] = re.extra["dblp"]
existing.volume = existing.volume or re.volume
existing.issue = existing.issue or re.issue
existing.pages = existing.pages or re.pages
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
except fatcat_openapi_client.rest.ApiException as err:
# there is a code path where we try to update the same release
# twice in a row; if that happens, just skip
# NOTE: API behavior might change in the future?
if "release_edit_editgroup_id_ident_id_key" in err.body:
- self.counts['skip-update-conflict'] += 1
+ self.counts["skip-update-conflict"] += 1
return False
else:
raise err
@@ -411,11 +411,14 @@ class DblpReleaseImporter(EntityImporter):
return False
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
"""
@@ -428,14 +431,14 @@ class DblpReleaseImporter(EntityImporter):
"""
contribs = []
index = 0
- for elem in authors.find_all('author'):
+ for elem in authors.find_all("author"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "author"
contrib.index = index
contribs.append(contrib)
index += 1
- for elem in authors.find_all('editor'):
+ for elem in authors.find_all("editor"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "editor"
contribs.append(contrib)
@@ -459,10 +462,10 @@ class DblpReleaseImporter(EntityImporter):
# remove number in author name, if present
if raw_name.split()[-1].isdigit():
- raw_name = ' '.join(raw_name.split()[:-1])
+ raw_name = " ".join(raw_name.split()[:-1])
- if elem.get('orcid'):
- orcid = clean_orcid(elem['orcid'])
+ if elem.get("orcid"):
+ orcid = clean_orcid(elem["orcid"])
if orcid:
creator_id = self.lookup_orcid(orcid)
if not creator_id:
@@ -491,22 +494,26 @@ class DblpReleaseImporter(EntityImporter):
wikidata_qid: Optional[str] = None
arxiv_id: Optional[str] = None
hdl: Optional[str] = None
- for ee in xml_elem.find_all('ee'):
+ for ee in xml_elem.find_all("ee"):
url = ee.text
# convert DOI-like domains, which mostly have DOIs anyways
- if '://doi.acm.org/' in url:
- url = url.replace('://doi.acm.org/', '://doi.org/')
- elif '://doi.ieeecomputersociety.org/' in url:
- url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/')
+ if "://doi.acm.org/" in url:
+ url = url.replace("://doi.acm.org/", "://doi.org/")
+ elif "://doi.ieeecomputersociety.org/" in url:
+ url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/")
- if 'doi.org/10.' in url and not doi:
+ if "doi.org/10." in url and not doi:
doi = clean_doi(url)
- elif 'wikidata.org/entity/Q' in url and not wikidata_qid:
+ elif "wikidata.org/entity/Q" in url and not wikidata_qid:
wikidata_qid = clean_wikidata_qid(url)
- elif '://arxiv.org/abs/' in url and not arxiv_id:
- arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '')
+ elif "://arxiv.org/abs/" in url and not arxiv_id:
+ arxiv_id = (
+ url.replace("http://", "")
+ .replace("https://", "")
+ .replace("arxiv.org/abs/", "")
+ )
arxiv_id = clean_arxiv_id(arxiv_id)
- elif '://hdl.handle.net' in url and not hdl:
+ elif "://hdl.handle.net" in url and not hdl:
hdl = clean_hdl(url)
return fatcat_openapi_client.ReleaseExtIds(
@@ -525,14 +532,14 @@ class DblpReleaseImporter(EntityImporter):
sandcrawler ingest requests.
"""
EXTID_PATTERNS = [
- '://doi.acm.org/',
- '://doi.ieeecomputersociety.org/',
- 'doi.org/10.',
- 'wikidata.org/entity/Q',
- '://arxiv.org/abs/',
+ "://doi.acm.org/",
+ "://doi.ieeecomputersociety.org/",
+ "doi.org/10.",
+ "wikidata.org/entity/Q",
+ "://arxiv.org/abs/",
]
urls = []
- for ee in xml_elem.find_all('ee'):
+ for ee in xml_elem.find_all("ee"):
url = ee.text
skip = False
for pattern in EXTID_PATTERNS:
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 1831c4cd..cd063337 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -28,26 +28,23 @@ MAX_ABSTRACT_LENGTH = 2048
class DoajArticleImporter(EntityImporter):
-
- def __init__(self,
- api,
- issn_map_file,
- **kwargs):
+ def __init__(self, api, issn_map_file, **kwargs):
eg_desc = kwargs.get(
- 'editgroup_description',
- "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps"
+ "editgroup_description",
+ "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps",
)
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent',
- 'fatcat_tools.DoajArticleImporter')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DoajArticleImporter")
# ensure default is to not do updates with this worker (override super() default)
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- issn_map_file=issn_map_file,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(
+ api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs,
+ )
self.this_year = datetime.datetime.now().year
self.read_issn_map_file(issn_map_file)
@@ -82,21 +79,21 @@ class DoajArticleImporter(EntityImporter):
}
"""
- if not obj or not isinstance(obj, dict) or 'bibjson' not in obj:
- self.counts['skip-empty'] += 1
+ if not obj or not isinstance(obj, dict) or "bibjson" not in obj:
+ self.counts["skip-empty"] += 1
return None
- bibjson = obj['bibjson']
+ bibjson = obj["bibjson"]
- title = clean_str(bibjson.get('title'), force_xml=True)
+ title = clean_str(bibjson.get("title"), force_xml=True)
if not title:
- self.counts['skip-title'] += 1
+ self.counts["skip-title"] += 1
return False
- container_name = clean_str(bibjson['journal']['title'])
+ container_name = clean_str(bibjson["journal"]["title"])
container_id = None
# NOTE: 'issns' not documented in API schema
- for issn in bibjson['journal']['issns']:
+ for issn in bibjson["journal"]["issns"]:
issnl = self.issn2issnl(issn)
if issnl:
container_id = self.lookup_issnl(self.issn2issnl(issn))
@@ -105,75 +102,83 @@ class DoajArticleImporter(EntityImporter):
container_name = None
break
- volume = clean_str(bibjson['journal'].get('volume'))
+ volume = clean_str(bibjson["journal"].get("volume"))
# NOTE: this schema seems to use "number" as "issue number"
- issue = clean_str(bibjson['journal'].get('number'))
- publisher = clean_str(bibjson['journal'].get('publisher'))
+ issue = clean_str(bibjson["journal"].get("number"))
+ publisher = clean_str(bibjson["journal"].get("publisher"))
try:
- release_year = int(bibjson.get('year'))
+ release_year = int(bibjson.get("year"))
except (TypeError, ValueError):
release_year = None
- release_month = parse_month(clean_str(bibjson.get('month')))
+ release_month = parse_month(clean_str(bibjson.get("month")))
# block bogus far-future years/dates
- if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ if release_year is not None and (
+ release_year > (self.this_year + 5) or release_year < 1000
+ ):
release_month = None
release_year = None
- license_slug = self.doaj_license_slug(bibjson['journal'].get('license'))
- country = parse_country_name(bibjson['journal'].get('country'))
+ license_slug = self.doaj_license_slug(bibjson["journal"].get("license"))
+ country = parse_country_name(bibjson["journal"].get("country"))
language = None
- for raw in bibjson['journal'].get('language') or []:
+ for raw in bibjson["journal"].get("language") or []:
language = parse_lang_name(raw)
if language:
break
# pages
# NOTE: error in API docs? seems like start_page not under 'journal' object
- start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page'))
- end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page'))
+ start_page = clean_str(bibjson["journal"].get("start_page")) or clean_str(
+ bibjson.get("start_page")
+ )
+ end_page = clean_str(bibjson["journal"].get("end_page")) or clean_str(
+ bibjson.get("end_page")
+ )
pages: Optional[str] = None
if start_page and end_page:
pages = f"{start_page}-{end_page}"
elif start_page:
pages = start_page
- doaj_article_id = obj['id'].lower()
- ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id)
+ doaj_article_id = obj["id"].lower()
+ ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id)
abstracts = self.doaj_abstracts(bibjson)
- contribs = self.doaj_contribs(bibjson.get('author') or [])
+ contribs = self.doaj_contribs(bibjson.get("author") or [])
# DOAJ-specific extra
doaj_extra = dict()
- if bibjson.get('subject'):
- doaj_extra['subject'] = bibjson.get('subject')
- if bibjson.get('keywords'):
- doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k]
+ if bibjson.get("subject"):
+ doaj_extra["subject"] = bibjson.get("subject")
+ if bibjson.get("keywords"):
+ doaj_extra["keywords"] = [
+ k for k in [clean_str(s) for s in bibjson.get("keywords")] if k
+ ]
# generic extra
extra = dict()
if country:
- extra['country'] = country
+ extra["country"] = country
if not container_id and container_name:
- extra['container_name'] = container_name
+ extra["container_name"] = container_name
if release_year and release_month:
# TODO: schema migration
- extra['release_month'] = release_month
+ extra["release_month"] = release_month
if doaj_extra:
- extra['doaj'] = doaj_extra
+ extra["doaj"] = doaj_extra
if not extra:
extra = None
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
container_id=container_id,
- release_type='article-journal',
- release_stage='published',
+ release_type="article-journal",
+ release_stage="published",
title=title,
release_year=release_year,
- #release_date,
+ # release_date,
publisher=publisher,
ext_ids=ext_ids,
contribs=contribs,
@@ -208,11 +213,11 @@ class DoajArticleImporter(EntityImporter):
# then try other ext_id lookups
if not existing:
- for extid_type in ('doi', 'pmid', 'pmcid'):
+ for extid_type in ("doi", "pmid", "pmcid"):
extid_val = getattr(re.ext_ids, extid_type)
if not extid_val:
continue
- #print(f" lookup release type: {extid_type} val: {extid_val}")
+ # print(f" lookup release type: {extid_type} val: {extid_val}")
try:
existing = self.api.lookup_release(**{extid_type: extid_val})
except fatcat_openapi_client.rest.ApiException as err:
@@ -241,7 +246,7 @@ class DoajArticleImporter(EntityImporter):
# other logic could go here about skipping updates
if not self.do_updates or existing.ext_ids.doaj:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# fields to copy over for update
@@ -250,7 +255,7 @@ class DoajArticleImporter(EntityImporter):
existing.release_stage = existing.release_stage or re.release_stage
existing.container_id = existing.container_id or re.container_id
existing.abstracts = existing.abstracts or re.abstracts
- existing.extra['doaj'] = re.extra['doaj']
+ existing.extra["doaj"] = re.extra["doaj"]
existing.volume = existing.volume or re.volume
existing.issue = existing.issue or re.issue
existing.pages = existing.pages or re.pages
@@ -258,13 +263,13 @@ class DoajArticleImporter(EntityImporter):
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
except fatcat_openapi_client.rest.ApiException as err:
# there is a code path where we try to update the same release
# twice in a row; if that happens, just skip
# NOTE: API behavior might change in the future?
if "release_edit_editgroup_id_ident_id_key" in err.body:
- self.counts['skip-update-conflict'] += 1
+ self.counts["skip-update-conflict"] += 1
return False
else:
raise err
@@ -272,14 +277,17 @@ class DoajArticleImporter(EntityImporter):
return False
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]:
- text = clean_str(bibjson.get('abstract'))
+ text = clean_str(bibjson.get("abstract"))
if not text or len(text) < 10:
return []
if len(text) > MAX_ABSTRACT_LENGTH:
@@ -293,7 +301,9 @@ class DoajArticleImporter(EntityImporter):
lang=lang,
)
- return [abstract,]
+ return [
+ abstract,
+ ]
def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
"""
@@ -306,23 +316,27 @@ class DoajArticleImporter(EntityImporter):
contribs = []
index = 0
for author in authors:
- if not author.get('name'):
+ if not author.get("name"):
continue
creator_id = None
- orcid = clean_orcid(author.get('orcid_id'))
+ orcid = clean_orcid(author.get("orcid_id"))
if orcid:
creator_id = self.lookup_orcid(orcid)
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- raw_name=author.get('name'),
- role='author',
- index=index,
- creator_id=creator_id,
- raw_affiliation=clean_str(author.get('affiliation')),
- ))
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ raw_name=author.get("name"),
+ role="author",
+ index=index,
+ creator_id=creator_id,
+ raw_affiliation=clean_str(author.get("affiliation")),
+ )
+ )
index += 1
return contribs
- def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds:
+ def doaj_ext_ids(
+ self, identifiers: List[dict], doaj_article_id: str
+ ) -> fatcat_openapi_client.ReleaseExtIds:
"""
bibjson.identifier {
id (string),
@@ -336,14 +350,14 @@ class DoajArticleImporter(EntityImporter):
pmid: Optional[str] = None
pmcid: Optional[str] = None
for id_obj in identifiers:
- if not id_obj.get('id'):
+ if not id_obj.get("id"):
continue
- if id_obj['type'].lower() == 'doi':
- doi = clean_doi(id_obj['id'])
- elif id_obj['type'].lower() == 'pmid':
- pmid = clean_pmid(id_obj['id'])
- elif id_obj['type'].lower() == 'pmcid':
- pmcid = clean_pmcid(id_obj['id'])
+ if id_obj["type"].lower() == "doi":
+ doi = clean_doi(id_obj["id"])
+ elif id_obj["type"].lower() == "pmid":
+ pmid = clean_pmid(id_obj["id"])
+ elif id_obj["type"].lower() == "pmcid":
+ pmcid = clean_pmcid(id_obj["id"])
return fatcat_openapi_client.ReleaseExtIds(
doaj=doaj_article_id,
@@ -365,10 +379,10 @@ class DoajArticleImporter(EntityImporter):
if not license_list:
return None
for license in license_list:
- if not license.get('open_access'):
+ if not license.get("open_access"):
continue
- slug = license.get('type')
- if slug.startswith('CC '):
- slug = slug.replace('CC ', 'cc-').lower()
+ slug = license.get("type")
+ if slug.startswith("CC "):
+ slug = slug.replace("CC ", "cc-").lower()
return slug
return None
diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py
index 0951ed84..26584ff3 100644
--- a/python/fatcat_tools/importers/file_meta.py
+++ b/python/fatcat_tools/importers/file_meta.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from .common import EntityImporter
@@ -17,19 +16,16 @@ class FileMetaImporter(EntityImporter):
def __init__(self, api, require_grobid=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileMetaImporter')
- kwargs['do_updates'] = kwargs.get("do_updates", True)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates"
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileMetaImporter")
+ kwargs["do_updates"] = kwargs.get("do_updates", True)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
- for k in ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype'):
+ for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"):
if not row.get(k):
- self.counts['skip-missing-field'] += 1
+ self.counts["skip-missing-field"] += 1
return False
return True
@@ -40,11 +36,11 @@ class FileMetaImporter(EntityImporter):
file_meta = row
fe = fatcat_openapi_client.FileEntity(
- md5=file_meta['md5hex'],
- sha1=file_meta['sha1hex'],
- sha256=file_meta['sha256hex'],
- size=file_meta['size_bytes'],
- mimetype=file_meta['mimetype'],
+ md5=file_meta["md5hex"],
+ sha1=file_meta["sha1hex"],
+ sha256=file_meta["sha256hex"],
+ size=file_meta["size_bytes"],
+ mimetype=file_meta["mimetype"],
)
return fe
@@ -59,11 +55,11 @@ class FileMetaImporter(EntityImporter):
raise err
if not existing:
- self.counts['skip-no-match'] += 1
+ self.counts["skip-no-match"] += 1
return False
- if (existing.md5 and existing.sha256 and existing.size and existing.mimetype):
- self.counts['skip-existing-complete'] += 1
+ if existing.md5 and existing.sha256 and existing.size and existing.mimetype:
+ self.counts["skip-existing-complete"] += 1
return False
existing.md5 = existing.md5 or fe.md5
@@ -75,5 +71,5 @@ class FileMetaImporter(EntityImporter):
existing = self.generic_file_cleanups(existing)
self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py
index 43c2a49c..dd8f5600 100644
--- a/python/fatcat_tools/importers/fileset_generic.py
+++ b/python/fatcat_tools/importers/fileset_generic.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from fatcat_tools import entity_from_dict
@@ -20,34 +19,31 @@ class FilesetImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Generic Fileset entity import"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FilesetImporter')
- kwargs['do_updates'] = bool(kwargs.get("do_updates", False))
+ eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import"
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FilesetImporter")
+ kwargs["do_updates"] = bool(kwargs.get("do_updates", False))
self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False))
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
# bezerk mode doesn't make sense for this importer
assert self.bezerk_mode is False
def want(self, row):
- if not row.get('release_ids'):
- self.counts['skip-no-release-ids'] += 1
+ if not row.get("release_ids"):
+ self.counts["skip-no-release-ids"] += 1
return False
- if not row.get('urls'):
- self.counts['skip-no-urls'] += 1
+ if not row.get("urls"):
+ self.counts["skip-no-urls"] += 1
return False
- if not row.get('manifest'):
- self.counts['skip-no-files'] += 1
+ if not row.get("manifest"):
+ self.counts["skip-no-files"] += 1
return False
- for f in row.get('manifest'):
- for k in ('sha1', 'md5'):
+ for f in row.get("manifest"):
+ for k in ("sha1", "md5"):
if not f.get(k):
- self.counts['skip-missing-file-field'] += 1
+ self.counts["skip-missing-file-field"] += 1
return False
return True
@@ -66,19 +62,24 @@ class FilesetImporter(EntityImporter):
if not self.skip_release_fileset_check:
for release_id in fse.release_ids:
# don't catch 404, that would be an error
- release = self.api.get_release(release_id, expand='filesets', hide='abstracts,refs')
- assert release.state == 'active'
+ release = self.api.get_release(
+ release_id, expand="filesets", hide="abstracts,refs"
+ )
+ assert release.state == "active"
if release.filesets:
- self.counts['exists'] += 1
- self.counts['exists-via-release-filesets'] += 1
+ self.counts["exists"] += 1
+ self.counts["exists-via-release-filesets"] += 1
return False
# do the insert
return True
def insert_batch(self, batch):
- self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_fileset_auto_batch(
+ fatcat_openapi_client.FilesetAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 0f666652..f7bb5357 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,7 +7,7 @@ import fatcat_openapi_client
from .common import EntityImporter, clean, make_rel_url
-MAX_ABSTRACT_BYTES=4096
+MAX_ABSTRACT_BYTES = 4096
class GrobidMetadataImporter(EntityImporter):
@@ -24,14 +24,13 @@ class GrobidMetadataImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Import of release and file metadata, as extracted from PDFs by GROBID.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Import of release and file metadata, as extracted from PDFs by GROBID.",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.GrobidMetadataImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
self.longtail_oa = kwargs.get("longtail_oa", False)
@@ -40,7 +39,7 @@ class GrobidMetadataImporter(EntityImporter):
def parse_record(self, row):
- fields = row.split('\t')
+ fields = row.split("\t")
sha1_key = fields[0]
cdx = json.loads(fields[1])
mimetype = fields[2]
@@ -65,8 +64,8 @@ class GrobidMetadataImporter(EntityImporter):
# TODO: this is where we should check if the file actually has
# release_ids and/or URLs associated with it
if existing and not self.bezerk_mode:
- self.counts['exists'] += 1
- self.counts['skip'] -= 1
+ self.counts["exists"] += 1
+ self.counts["skip"] -= 1
return None
release_edit = self.create_release(re)
@@ -75,75 +74,81 @@ class GrobidMetadataImporter(EntityImporter):
def parse_grobid_json(self, obj):
- if not obj.get('title'):
+ if not obj.get("title"):
return None
extra_grobid = dict()
- abstract = obj.get('abstract')
+ abstract = obj.get("abstract")
if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
abobj = fatcat_openapi_client.ReleaseAbstract(
- mimetype="text/plain",
- content=clean(obj.get('abstract')))
+ mimetype="text/plain", content=clean(obj.get("abstract"))
+ )
abstracts = [abobj]
else:
abstracts = None
contribs = []
- for i, a in enumerate(obj.get('authors', [])):
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- index=i,
- raw_name=clean(a['name']),
- given_name=clean(a.get('given_name')),
- surname=clean(a.get('surname')),
- role="author",
- extra=None))
+ for i, a in enumerate(obj.get("authors", [])):
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ index=i,
+ raw_name=clean(a["name"]),
+ given_name=clean(a.get("given_name")),
+ surname=clean(a.get("surname")),
+ role="author",
+ extra=None,
+ )
+ )
refs = []
- for raw in obj.get('citations', []):
+ for raw in obj.get("citations", []):
cite_extra = dict()
year = None
- if raw.get('date'):
+ if raw.get("date"):
try:
- year = int(raw['date'].strip()[:4])
+ year = int(raw["date"].strip()[:4])
except (IndexError, ValueError):
pass
- for key in ('volume', 'url', 'issue', 'publisher'):
+ for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
cite_extra[key] = clean(raw[key])
- if raw.get('authors'):
- cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
+ if raw.get("authors"):
+ cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
if not cite_extra:
cite_extra = None
- refs.append(fatcat_openapi_client.ReleaseRef(
- key=clean(raw.get('id')),
- year=year,
- title=clean(raw['title']),
- extra=cite_extra))
+ refs.append(
+ fatcat_openapi_client.ReleaseRef(
+ key=clean(raw.get("id")),
+ year=year,
+ title=clean(raw["title"]),
+ extra=cite_extra,
+ )
+ )
release_date = None
release_year = None
- if obj.get('date'):
+ if obj.get("date"):
# only returns year, ever?
- release_year = int(obj['date'][:4])
+ release_year = int(obj["date"][:4])
extra = dict()
- if obj.get('doi'):
- extra['doi'] = obj['doi']
- if obj['journal'] and obj['journal'].get('name'):
- extra['container_name'] = clean(obj['journal']['name'])
+ if obj.get("doi"):
+ extra["doi"] = obj["doi"]
+ if obj["journal"] and obj["journal"].get("name"):
+ extra["container_name"] = clean(obj["journal"]["name"])
# TODO: ISSN/eISSN handling? or just journal name lookup?
if extra_grobid:
- extra['grobid'] = extra_grobid
+ extra["grobid"] = extra_grobid
if self.longtail_oa:
- extra['longtail_oa'] = True
+ extra["longtail_oa"] = True
if not extra:
extra = None
- title = clean(obj['title'], force_xml=True)
+ title = clean(obj["title"], force_xml=True)
if not title or len(title) < 2:
return None
@@ -154,17 +159,22 @@ class GrobidMetadataImporter(EntityImporter):
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=clean(obj['journal'].get('publisher')),
- volume=clean(obj['journal'].get('volume')),
- issue=clean(obj['journal'].get('issue')),
+ publisher=clean(obj["journal"].get("publisher")),
+ volume=clean(obj["journal"].get("volume")),
+ issue=clean(obj["journal"].get("issue")),
abstracts=abstracts,
ext_ids=fatcat_openapi_client.ReleaseExtIds(),
- extra=extra)
+ extra=extra,
+ )
return re
def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
- sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
+ sha1 = (
+ base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
+ .decode("ascii")
+ .lower()
+ )
fe = fatcat_openapi_client.FileEntity(
sha1=sha1,
@@ -175,16 +185,15 @@ class GrobidMetadataImporter(EntityImporter):
)
# parse URLs and CDX
- original = cdx['url']
- assert len(cdx['dt']) >= 8
- wayback = "https://web.archive.org/web/{}/{}".format(
- cdx['dt'],
- original)
- fe.urls.append(
- fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
+ original = cdx["url"]
+ assert len(cdx["dt"]) >= 8
+ wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
+ fe.urls.append(fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
original_url = make_rel_url(original, default_link_rel=self.default_link_rel)
if original_url is not None:
- fe.urls.append(fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1]))
+ fe.urls.append(
+ fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1])
+ )
return fe
@@ -193,8 +202,11 @@ class GrobidMetadataImporter(EntityImporter):
return True
def insert_batch(self, batch):
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index f0943c1e..e0a6c3f5 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -1,4 +1,3 @@
-
import datetime
import fatcat_openapi_client
@@ -7,17 +6,16 @@ from .common import EntityImporter, make_rel_url
class IngestFileResultImporter(EntityImporter):
-
def __init__(self, api, require_grobid=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Files crawled from web using sandcrawler ingest tool"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFileResultImporter")
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.use_glutton_match = False
self.default_link_rel = kwargs.get("default_link_rel", "web")
assert self.default_link_rel
@@ -27,20 +25,20 @@ class IngestFileResultImporter(EntityImporter):
else:
print("NOT checking GROBID success")
self.ingest_request_source_allowlist = [
- 'fatcat-changelog',
- 'fatcat-ingest-container',
- 'fatcat-ingest',
- 'arabesque',
+ "fatcat-changelog",
+ "fatcat-ingest-container",
+ "fatcat-ingest",
+ "arabesque",
#'mag-corpus',
#'mag',
- 'unpaywall-corpus',
- 'unpaywall',
+ "unpaywall-corpus",
+ "unpaywall",
#'s2-corpus',
#'s2',
- 'doaj',
- 'dblp',
+ "doaj",
+ "dblp",
]
- if kwargs.get('skip_source_allowlist', False):
+ if kwargs.get("skip_source_allowlist", False):
self.ingest_request_source_allowlist = []
def want_file(self, row) -> bool:
@@ -48,28 +46,32 @@ class IngestFileResultImporter(EntityImporter):
File-specific part of want(). Generic across general ingest and save-paper-now.
"""
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+ if not row.get("file_meta"):
+ self.counts["skip-file-meta"] += 1
return False
# type-specific filters
- if row['request'].get('ingest_type') == 'pdf':
- if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
- self.counts['skip-grobid'] += 1
+ if row["request"].get("ingest_type") == "pdf":
+ if self.require_grobid and row.get("grobid", {}).get("status_code") != 200:
+ self.counts["skip-grobid"] += 1
return False
- if row['file_meta'].get('mimetype') not in ("application/pdf",):
- self.counts['skip-mimetype'] += 1
+ if row["file_meta"].get("mimetype") not in ("application/pdf",):
+ self.counts["skip-mimetype"] += 1
return False
- elif row['request'].get('ingest_type') == 'xml':
- if row['file_meta'].get('mimetype') not in ("application/xml",
- "application/jats+xml", "application/tei+xml", "text/xml"):
- self.counts['skip-mimetype'] += 1
+ elif row["request"].get("ingest_type") == "xml":
+ if row["file_meta"].get("mimetype") not in (
+ "application/xml",
+ "application/jats+xml",
+ "application/tei+xml",
+ "text/xml",
+ ):
+ self.counts["skip-mimetype"] += 1
return False
- elif row['request'].get('ingest_type') in ['component', 'src', 'dataset-file']:
+ elif row["request"].get("ingest_type") in ["component", "src", "dataset-file"]:
# we rely on sandcrawler for these checks
pass
else:
- self.counts['skip-ingest-type'] += 1
+ self.counts["skip-ingest-type"] += 1
return False
return True
@@ -79,24 +81,36 @@ class IngestFileResultImporter(EntityImporter):
Sandcrawler ingest-specific part of want(). Generic across file and
webcapture ingest.
"""
- if row.get('hit') is not True:
- self.counts['skip-hit'] += 1
+ if row.get("hit") is not True:
+ self.counts["skip-hit"] += 1
return False
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist:
- self.counts['skip-ingest_request_source'] += 1
+ if (
+ self.ingest_request_source_allowlist
+ and source not in self.ingest_request_source_allowlist
+ ):
+ self.counts["skip-ingest_request_source"] += 1
return False
- if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'):
- self.counts['skip-link-source'] += 1
+ if row["request"].get("link_source") not in (
+ "arxiv",
+ "pmc",
+ "unpaywall",
+ "doi",
+ "mag",
+ "s2",
+ "doaj",
+ "dblp",
+ ):
+ self.counts["skip-link-source"] += 1
return False
- if source.startswith('savepapernow'):
+ if source.startswith("savepapernow"):
# never process async savepapernow requests
- self.counts['skip-savepapernow'] += 1
+ self.counts["skip-savepapernow"] += 1
return False
return True
@@ -125,19 +139,19 @@ class IngestFileResultImporter(EntityImporter):
def parse_ingest_release_ident(self, row):
- request = row['request']
- fatcat = request.get('fatcat')
+ request = row["request"]
+ fatcat = request.get("fatcat")
release_ident = None
- if fatcat and fatcat.get('release_ident'):
- release_ident = fatcat.get('release_ident')
- elif request.get('ext_ids'):
+ if fatcat and fatcat.get("release_ident"):
+ release_ident = fatcat.get("release_ident")
+ elif request.get("ext_ids"):
# if no fatcat ident, try extids
- for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv', 'doaj', 'dblp'):
- extid = request['ext_ids'].get(extid_type)
+ for extid_type in ("doi", "pmid", "pmcid", "arxiv", "doaj", "dblp"):
+ extid = request["ext_ids"].get(extid_type)
if not extid:
continue
- if extid_type == 'doi':
+ if extid_type == "doi":
extid = extid.lower()
try:
release = self.api.lookup_release(**{extid_type: extid})
@@ -145,66 +159,69 @@ class IngestFileResultImporter(EntityImporter):
if err.status == 404:
continue
elif err.status == 400:
- self.counts['warn-extid-invalid'] += 1
+ self.counts["warn-extid-invalid"] += 1
continue
raise err
# verify release_stage
- if request.get('release_stage') and release.release_stage:
- if request['release_stage'] != release.release_stage:
- self.counts['skip-release-stage'] += 1
+ if request.get("release_stage") and release.release_stage:
+ if request["release_stage"] != release.release_stage:
+ self.counts["skip-release-stage"] += 1
return None
release_ident = release.ident
break
- if self.use_glutton_match and not release_ident and row.get('grobid'):
+ if self.use_glutton_match and not release_ident and row.get("grobid"):
# try biblio-glutton extracted hit
- if row['grobid'].get('fatcat_release'):
- release_ident = row['grobid']['fatcat_release'].split('_')[-1]
- self.counts['glutton-match'] += 1
+ if row["grobid"].get("fatcat_release"):
+ release_ident = row["grobid"]["fatcat_release"].split("_")[-1]
+ self.counts["glutton-match"] += 1
return release_ident
def parse_terminal(self, row):
- terminal = row.get('terminal')
+ terminal = row.get("terminal")
if not terminal:
# support old cdx-only ingest results
- cdx = row.get('cdx')
+ cdx = row.get("cdx")
if not cdx:
return None
else:
terminal = {
- 'terminal_url': cdx['url'],
- 'terminal_dt': cdx['datetime'],
- 'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'),
+ "terminal_url": cdx["url"],
+ "terminal_dt": cdx["datetime"],
+ "terminal_status_code": cdx.get("status_code") or cdx.get("http_status"),
}
# work around old schema
- if 'terminal_url' not in terminal:
- terminal['terminal_url'] = terminal['url']
- if 'terminal_dt' not in terminal:
- terminal['terminal_dt'] = terminal['dt']
+ if "terminal_url" not in terminal:
+ terminal["terminal_url"] = terminal["url"]
+ if "terminal_dt" not in terminal:
+ terminal["terminal_dt"] = terminal["dt"]
# convert CDX-style digits to ISO-style timestamp
- assert len(terminal['terminal_dt']) == 14
- terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z"
+ assert len(terminal["terminal_dt"]) == 14
+ terminal["terminal_timestamp"] = (
+ datetime.datetime.strptime(terminal["terminal_dt"], "%Y%m%d%H%M%S").isoformat()
+ + "Z"
+ )
return terminal
def parse_urls(self, row, terminal):
- request = row['request']
+ request = row["request"]
default_rel = self.default_link_rel
- if request.get('link_source') == 'doi':
- default_rel = 'publisher'
- default_rel = request.get('rel', default_rel)
- url = make_rel_url(terminal['terminal_url'], default_rel)
+ if request.get("link_source") == "doi":
+ default_rel = "publisher"
+ default_rel = request.get("rel", default_rel)
+ url = make_rel_url(terminal["terminal_url"], default_rel)
if not url:
- self.counts['skip-url'] += 1
+ self.counts["skip-url"] += 1
return None
wayback = "https://web.archive.org/web/{}/{}".format(
- terminal['terminal_dt'],
- terminal['terminal_url'])
+ terminal["terminal_dt"], terminal["terminal_url"]
+ )
urls = [url, ("webarchive", wayback)]
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
@@ -212,38 +229,38 @@ class IngestFileResultImporter(EntityImporter):
def parse_edit_extra(self, row):
- request = row['request']
+ request = row["request"]
edit_extra = dict()
- if request.get('edit_extra'):
- edit_extra = request['edit_extra']
+ if request.get("edit_extra"):
+ edit_extra = request["edit_extra"]
- if request.get('ingest_request_source'):
- edit_extra['ingest_request_source'] = request['ingest_request_source']
- if request.get('link_source') and request.get('link_source_id'):
- edit_extra['link_source'] = request['link_source']
- edit_extra['link_source_id'] = request['link_source_id']
- if edit_extra['link_source'] == 'doi':
- edit_extra['link_source_id'] = edit_extra['link_source_id'].lower()
+ if request.get("ingest_request_source"):
+ edit_extra["ingest_request_source"] = request["ingest_request_source"]
+ if request.get("link_source") and request.get("link_source_id"):
+ edit_extra["link_source"] = request["link_source"]
+ edit_extra["link_source_id"] = request["link_source_id"]
+ if edit_extra["link_source"] == "doi":
+ edit_extra["link_source_id"] = edit_extra["link_source_id"].lower()
# GROBID metadata, for SPN requests (when there might not be 'success')
- if request.get('ingest_type') == 'pdf':
- if row.get('grobid') and row['grobid'].get('status') != 'success':
- edit_extra['grobid_status_code'] = row['grobid']['status_code']
- edit_extra['grobid_version'] = row['grobid'].get('grobid_version')
+ if request.get("ingest_type") == "pdf":
+ if row.get("grobid") and row["grobid"].get("status") != "success":
+ edit_extra["grobid_status_code"] = row["grobid"]["status_code"]
+ edit_extra["grobid_version"] = row["grobid"].get("grobid_version")
return edit_extra
def parse_record(self, row):
- request = row['request']
- file_meta = row['file_meta']
+ request = row["request"]
+ file_meta = row["file_meta"]
# double check that want() filtered request correctly (eg, old requests)
- if request.get('ingest_type') not in ('pdf', 'xml'):
- self.counts['skip-ingest-type'] += 1
+ if request.get("ingest_type") not in ("pdf", "xml"):
+ self.counts["skip-ingest-type"] += 1
return None
- assert (request['ingest_type'], file_meta['mimetype']) in [
+ assert (request["ingest_type"], file_meta["mimetype"]) in [
("pdf", "application/pdf"),
("xml", "application/xml"),
("xml", "application/jats+xml"),
@@ -255,23 +272,23 @@ class IngestFileResultImporter(EntityImporter):
release_ident = self.parse_ingest_release_ident(row)
if not release_ident:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
terminal = self.parse_terminal(row)
if not terminal:
# TODO: support archive.org hits?
- self.counts['skip-no-terminal'] += 1
+ self.counts["skip-no-terminal"] += 1
return None
urls = self.parse_urls(row, terminal)
fe = fatcat_openapi_client.FileEntity(
- md5=file_meta['md5hex'],
- sha1=file_meta['sha1hex'],
- sha256=file_meta['sha256hex'],
- size=file_meta['size_bytes'],
- mimetype=file_meta['mimetype'],
+ md5=file_meta["md5hex"],
+ sha1=file_meta["sha1hex"],
+ sha256=file_meta["sha256hex"],
+ size=file_meta["size_bytes"],
+ mimetype=file_meta["mimetype"],
release_ids=[release_ident],
urls=urls,
)
@@ -293,7 +310,7 @@ class IngestFileResultImporter(EntityImporter):
# check for existing edits-in-progress with same file hash
for other in self._entity_queue:
if other.sha1 == fe.sha1:
- self.counts['skip-in-queue'] += 1
+ self.counts["skip-in-queue"] += 1
return False
if not existing:
@@ -302,31 +319,36 @@ class IngestFileResultImporter(EntityImporter):
# NOTE: the following checks all assume there is an existing item
if (fe.release_ids[0] in existing.release_ids) and existing.urls:
# TODO: could still, in theory update with the new URL?
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if not self.do_updates:
- self.counts['skip-update-disabled'] += 1
+ self.counts["skip-update-disabled"] += 1
return False
# TODO: for now, never update
- self.counts['skip-update-disabled'] += 1
+ self.counts["skip-update-disabled"] += 1
return False
def insert_batch(self, batch):
if self.submit_mode:
- eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ eg = self.api.create_editgroup(
+ fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
for fe in batch:
self.api.create_file(eg.editgroup_id, fe)
self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
else:
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
class SavePaperNowFileImporter(IngestFileResultImporter):
@@ -338,29 +360,29 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
def __init__(self, api, submit_mode=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFileImporter')
- kwargs['submit_mode'] = submit_mode
- kwargs['require_grobid'] = False
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Files crawled after a public 'Save Paper Now' request"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFileImporter")
+ kwargs["submit_mode"] = submit_mode
+ kwargs["require_grobid"] = False
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if not source.startswith('savepapernow'):
- self.counts['skip-not-savepapernow'] += 1
+ if not source.startswith("savepapernow"):
+ self.counts["skip-not-savepapernow"] += 1
return False
- if row.get('hit') is not True:
- self.counts['skip-hit'] += 1
+ if row.get("hit") is not True:
+ self.counts["skip-hit"] += 1
return False
if not self.want_file(row):
@@ -377,14 +399,14 @@ class IngestWebResultImporter(IngestFileResultImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter')
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Webcaptures crawled from web using sandcrawler ingest tool"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestWebResultImporter")
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
@@ -392,91 +414,95 @@ class IngestWebResultImporter(IngestFileResultImporter):
return False
# webcapture-specific filters
- if row['request'].get('ingest_type') != 'html':
- self.counts['skip-ingest-type'] += 1
+ if row["request"].get("ingest_type") != "html":
+ self.counts["skip-ingest-type"] += 1
return False
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+ if not row.get("file_meta"):
+ self.counts["skip-file-meta"] += 1
return False
- if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
- self.counts['skip-mimetype'] += 1
+ if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"):
+ self.counts["skip-mimetype"] += 1
return False
return True
def parse_record(self, row):
- request = row['request']
- file_meta = row['file_meta']
+ request = row["request"]
+ file_meta = row["file_meta"]
# double check that want() filtered request correctly (eg, old requests)
- if request.get('ingest_type') != "html":
- self.counts['skip-ingest-type'] += 1
+ if request.get("ingest_type") != "html":
+ self.counts["skip-ingest-type"] += 1
return None
- if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
- self.counts['skip-mimetype'] += 1
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ self.counts["skip-mimetype"] += 1
return None
# identify release by fatcat ident, or extid lookup
release_ident = self.parse_ingest_release_ident(row)
if not release_ident:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
terminal = self.parse_terminal(row)
if not terminal:
# TODO: support archive.org hits?
- self.counts['skip-no-terminal'] += 1
+ self.counts["skip-no-terminal"] += 1
return None
urls = self.parse_urls(row, terminal)
- archive_urls = [u for u in urls if u.rel == 'webarchive']
+ archive_urls = [u for u in urls if u.rel == "webarchive"]
- if terminal['terminal_status_code'] != 200:
- self.counts['skip-terminal-status-code'] += 1
+ if terminal["terminal_status_code"] != 200:
+ self.counts["skip-terminal-status-code"] += 1
return None
- terminal_cdx = row['cdx']
- if 'revisit_cdx' in row:
- terminal_cdx = row['revisit_cdx']
- assert terminal_cdx['surt']
- if terminal_cdx['url'] != terminal['terminal_url']:
- self.counts['skip-terminal-url-mismatch'] += 1
+ terminal_cdx = row["cdx"]
+ if "revisit_cdx" in row:
+ terminal_cdx = row["revisit_cdx"]
+ assert terminal_cdx["surt"]
+ if terminal_cdx["url"] != terminal["terminal_url"]:
+ self.counts["skip-terminal-url-mismatch"] += 1
return None
wc_cdx = []
# primary resource first
- wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine(
- surt=terminal_cdx['surt'],
- timestamp=terminal['terminal_timestamp'],
- url=terminal['terminal_url'],
- mimetype=file_meta['mimetype'],
- status_code=terminal['terminal_status_code'],
- sha1=file_meta['sha1hex'],
- sha256=file_meta['sha256hex'],
- size=file_meta['size_bytes'],
- ))
-
- for resource in row.get('html_resources', []):
- timestamp = resource['timestamp']
+ wc_cdx.append(
+ fatcat_openapi_client.WebcaptureCdxLine(
+ surt=terminal_cdx["surt"],
+ timestamp=terminal["terminal_timestamp"],
+ url=terminal["terminal_url"],
+ mimetype=file_meta["mimetype"],
+ status_code=terminal["terminal_status_code"],
+ sha1=file_meta["sha1hex"],
+ sha256=file_meta["sha256hex"],
+ size=file_meta["size_bytes"],
+ )
+ )
+
+ for resource in row.get("html_resources", []):
+ timestamp = resource["timestamp"]
if "+" not in timestamp and "Z" not in timestamp:
timestamp += "Z"
- wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine(
- surt=resource['surt'],
- timestamp=timestamp,
- url=resource['url'],
- mimetype=resource.get('mimetype'),
- size=resource.get('size'),
- sha1=resource.get('sha1hex'),
- sha256=resource.get('sha256hex'),
- ))
+ wc_cdx.append(
+ fatcat_openapi_client.WebcaptureCdxLine(
+ surt=resource["surt"],
+ timestamp=timestamp,
+ url=resource["url"],
+ mimetype=resource.get("mimetype"),
+ size=resource.get("size"),
+ sha1=resource.get("sha1hex"),
+ sha256=resource.get("sha256hex"),
+ )
+ )
wc = fatcat_openapi_client.WebcaptureEntity(
cdx=wc_cdx,
archive_urls=archive_urls,
- original_url=terminal['terminal_url'],
- timestamp=terminal['terminal_timestamp'],
+ original_url=terminal["terminal_url"],
+ timestamp=terminal["terminal_timestamp"],
release_ids=[release_ident],
)
@@ -491,11 +517,11 @@ class IngestWebResultImporter(IngestFileResultImporter):
# check for existing edits-in-progress with same URL
for other in self._entity_queue:
if other.original_url == wc.original_url:
- self.counts['skip-in-queue'] += 1
+ self.counts["skip-in-queue"] += 1
return False
# lookup sha1, or create new entity (TODO: API doesn't support this yet)
- #existing = None
+ # existing = None
# TODO: currently only allow one release per webcapture
release = self.api.get_release(wc.release_ids[0], expand="webcaptures")
@@ -504,9 +530,9 @@ class IngestWebResultImporter(IngestFileResultImporter):
for other in release.webcaptures:
if wc.original_url == other.original_url:
# TODO: compare very similar timestamps of same time (different formats)
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
- self.counts['skip-release-has-webcapture'] += 1
+ self.counts["skip-release-has-webcapture"] += 1
return False
# Ok, if we got here then no existing web capture for (first) release,
@@ -515,18 +541,24 @@ class IngestWebResultImporter(IngestFileResultImporter):
def insert_batch(self, batch):
if self.submit_mode:
- eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ eg = self.api.create_editgroup(
+ fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
for fe in batch:
self.api.create_webcapture(eg.editgroup_id, fe)
self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
else:
- self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_webcapture_auto_batch(
+ fatcat_openapi_client.WebcaptureAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
+
class SavePaperNowWebImporter(IngestWebResultImporter):
"""
@@ -535,15 +567,15 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
def __init__(self, api, submit_mode=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled after a public 'Save Paper Now' request"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowWebImporter')
- kwargs['submit_mode'] = submit_mode
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Webcaptures crawled after a public 'Save Paper Now' request"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowWebImporter")
+ kwargs["submit_mode"] = submit_mode
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
"""
@@ -553,27 +585,27 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
path, which means allowing hit=false.
"""
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if not source.startswith('savepapernow'):
- self.counts['skip-not-savepapernow'] += 1
+ if not source.startswith("savepapernow"):
+ self.counts["skip-not-savepapernow"] += 1
return False
# webcapture-specific filters
- if row['request'].get('ingest_type') != 'html':
- self.counts['skip-ingest-type'] += 1
+ if row["request"].get("ingest_type") != "html":
+ self.counts["skip-ingest-type"] += 1
return False
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+ if not row.get("file_meta"):
+ self.counts["skip-file-meta"] += 1
return False
- if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
- self.counts['skip-mimetype'] += 1
+ if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"):
+ self.counts["skip-mimetype"] += 1
return False
- if row.get('status') not in ['success', 'unknown-scope']:
- self.counts['skip-hit'] += 1
+ if row.get("status") not in ["success", "unknown-scope"]:
+ self.counts["skip-hit"] += 1
return False
return True
@@ -587,28 +619,28 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Filesets crawled from web using sandcrawler ingest tool"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFilesetResultImporter')
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Filesets crawled from web using sandcrawler ingest tool"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFilesetResultImporter")
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.max_file_count = 300
def want_fileset(self, row):
- if not row.get('manifest') or len(row.get('manifest')) == 0:
- self.counts['skip-empty-manifest'] += 1
+ if not row.get("manifest") or len(row.get("manifest")) == 0:
+ self.counts["skip-empty-manifest"] += 1
return False
- if len(row.get('manifest')) == 1:
- self.counts['skip-single-file'] += 1
+ if len(row.get("manifest")) == 1:
+ self.counts["skip-single-file"] += 1
return False
- if len(row.get('manifest')) > self.max_file_count:
- self.counts['skip-too-many-files'] += 1
+ if len(row.get("manifest")) > self.max_file_count:
+ self.counts["skip-too-many-files"] += 1
return False
return True
@@ -619,8 +651,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return False
# fileset-specific filters
- if row['request'].get('ingest_type') not in ['dataset',]:
- self.counts['skip-ingest-type'] += 1
+ if row["request"].get("ingest_type") not in [
+ "dataset",
+ ]:
+ self.counts["skip-ingest-type"] += 1
return False
if not self.want_fileset(row):
@@ -629,102 +663,118 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return True
def parse_fileset_urls(self, row):
- if not row.get('strategy'):
+ if not row.get("strategy"):
return []
- strategy = row['strategy']
+ strategy = row["strategy"]
urls = []
- if strategy == 'archiveorg-fileset' and row.get('archiveorg_item_name'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://archive.org/download/{row['archiveorg_item_name']}/",
- rel="archive-base",
- ))
- if row['strategy'].startswith('web-') and row.get('platform_base_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}",
- rel="webarchive-base",
- ))
+ if strategy == "archiveorg-fileset" and row.get("archiveorg_item_name"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://archive.org/download/{row['archiveorg_item_name']}/",
+ rel="archive-base",
+ )
+ )
+ if row["strategy"].startswith("web-") and row.get("platform_base_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}",
+ rel="webarchive-base",
+ )
+ )
# TODO: repository-base
# TODO: web-base
- if row['strategy'] == 'archiveorg-fileset-bundle' and row.get('archiveorg_item_name'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}",
- rel="archive-bundle",
- ))
+ if row["strategy"] == "archiveorg-fileset-bundle" and row.get("archiveorg_item_name"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}",
+ rel="archive-bundle",
+ )
+ )
- if row['strategy'] == 'web-fileset-bundle' and row.get('platform_bundle_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}",
- rel="webarchive-bundle",
- ))
+ if row["strategy"] == "web-fileset-bundle" and row.get("platform_bundle_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}",
+ rel="webarchive-bundle",
+ )
+ )
# add any additional / platform URLs here
- if row.get('platform_bundle_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=row['platform_bundle_url'],
- rel="repository-bundle",
- ))
- if row.get('platform_base_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=row['platform_bundle_url'],
- rel="repository-base",
- ))
+ if row.get("platform_bundle_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=row["platform_bundle_url"],
+ rel="repository-bundle",
+ )
+ )
+ if row.get("platform_base_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=row["platform_bundle_url"],
+ rel="repository-base",
+ )
+ )
return urls
def parse_record(self, row):
- request = row['request']
+ request = row["request"]
# double check that want() filtered request correctly
- if request.get('ingest_type') not in ["dataset",]:
- self.counts['skip-ingest-type'] += 1
+ if request.get("ingest_type") not in [
+ "dataset",
+ ]:
+ self.counts["skip-ingest-type"] += 1
return None
# identify release by fatcat ident, or extid lookup
release_ident = self.parse_ingest_release_ident(row)
if not release_ident:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
entity_extra = dict()
edit_extra = self.parse_edit_extra(row)
- edit_extra['ingest_strategy'] = row['ingest_strategy']
- if row.get('platform'):
- edit_extra['platform'] = row['platform']
- if row.get('platform_id'):
- edit_extra['platform_id'] = row['platform_id']
+ edit_extra["ingest_strategy"] = row["ingest_strategy"]
+ if row.get("platform"):
+ edit_extra["platform"] = row["platform"]
+ if row.get("platform_id"):
+ edit_extra["platform_id"] = row["platform_id"]
entity_urls = self.parse_fileset_urls(row)
if not entity_urls:
- self.counts['skip-no-access-url'] += 1
+ self.counts["skip-no-access-url"] += 1
return None
- assert row['file_count'] == len(row['manifest'])
- if row['file_count'] > self.max_file_count:
- self.counts['skip-too-many-manifest-files'] += 1
+ assert row["file_count"] == len(row["manifest"])
+ if row["file_count"] > self.max_file_count:
+ self.counts["skip-too-many-manifest-files"] += 1
return None
manifest = []
- for ingest_file in row['manifest']:
+ for ingest_file in row["manifest"]:
fsf = fatcat_openapi_client.FilesetFile(
- path=ingest_file['path'],
- size=ingest_file['size'],
- md5=ingest_file['md5'],
- sha1=ingest_file['sha1'],
- sha256=ingest_file.get('sha256'),
+ path=ingest_file["path"],
+ size=ingest_file["size"],
+ md5=ingest_file["md5"],
+ sha1=ingest_file["sha1"],
+ sha256=ingest_file.get("sha256"),
extra=dict(
- mimetype=ingest_file['mimetype'],
+ mimetype=ingest_file["mimetype"],
),
)
if not (fsf.md5 and fsf.sha1 and fsf.path and fsf.size):
- self.counts['skip-partial-file-info'] += 1
+ self.counts["skip-partial-file-info"] += 1
return None
- if ingest_file.get('platform_url'):
+ if ingest_file.get("platform_url"):
# XXX: should we include this?
- fsf.extra['original_url'] = ingest_file['platform_url']
- if ingest_file.get('terminal_url') and ingest_file.get('terminal_dt'):
- fsf.extra['wayback_url'] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}"
+ fsf.extra["original_url"] = ingest_file["platform_url"]
+ if ingest_file.get("terminal_url") and ingest_file.get("terminal_dt"):
+ fsf.extra[
+ "wayback_url"
+ ] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}"
manifest.append(fsf)
fe = fatcat_openapi_client.FilesetEntity(
@@ -745,11 +795,11 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
for other in self._entity_queue:
# XXX: how to duplicate check?
if other.original_url == wc.original_url:
- self.counts['skip-in-queue'] += 1
+ self.counts["skip-in-queue"] += 1
return False
# lookup sha1, or create new entity (TODO: API doesn't support this yet)
- #existing = None
+ # existing = None
# NOTE: in lieu of existing checks (by lookup), only allow one fileset per release
release = self.api.get_release(wc.release_ids[0], expand="filesets")
@@ -759,27 +809,32 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
for other in release.filesets:
if wc.original_url == other.original_url:
# TODO: compare very similar timestamps of same time (different formats)
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
- self.counts['skip-release-has-fileset'] += 1
+ self.counts["skip-release-has-fileset"] += 1
return False
return True
def insert_batch(self, batch):
if self.submit_mode:
- eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ eg = self.api.create_editgroup(
+ fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
for fe in batch:
self.api.create_fileset(eg.editgroup_id, fe)
self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
else:
- self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_fileset_auto_batch(
+ fatcat_openapi_client.FilesetAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
@@ -789,28 +844,28 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
def __init__(self, api, submit_mode=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Fileset crawled after a public 'Save Paper Now' request"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFilesetImporter')
- kwargs['submit_mode'] = submit_mode
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Fileset crawled after a public 'Save Paper Now' request"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFilesetImporter")
+ kwargs["submit_mode"] = submit_mode
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if not source.startswith('savepapernow'):
- self.counts['skip-not-savepapernow'] += 1
+ if not source.startswith("savepapernow"):
+ self.counts["skip-not-savepapernow"] += 1
return False
- if row.get('hit') is not True:
- self.counts['skip-hit'] += 1
+ if row.get("hit") is not True:
+ self.counts["skip-hit"] += 1
return False
if not self.want_fileset(row):
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 0a983c5e..8e3af416 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,4 +1,3 @@
-
import datetime
import sqlite3
import sys
@@ -33,26 +32,24 @@ def parse_jalc_persons(raw_persons):
# first parse out into language-agnostic dics
for raw in raw_persons:
- name = raw.find('name') or None
+ name = raw.find("name") or None
if name:
- name = clean(name.get_text().replace('\n', ' '))
- surname = raw.find('familyName') or None
+ name = clean(name.get_text().replace("\n", " "))
+ surname = raw.find("familyName") or None
if surname:
- surname = clean(surname.get_text().replace('\n', ' '))
- given_name = raw.find('givenName') or None
+ surname = clean(surname.get_text().replace("\n", " "))
+ given_name = raw.find("givenName") or None
if given_name:
- given_name = clean(given_name.get_text().replace('\n', ' '))
- lang = 'en'
+ given_name = clean(given_name.get_text().replace("\n", " "))
+ lang = "en"
if is_cjk(name):
- lang = 'ja'
- if lang == 'en' and surname and given_name:
+ lang = "ja"
+ if lang == "en" and surname and given_name:
# english names order is flipped
name = "{} {}".format(given_name, surname)
rc = fatcat_openapi_client.ReleaseContrib(
- raw_name=name,
- surname=surname,
- given_name=given_name,
- role="author")
+ raw_name=name, surname=surname, given_name=given_name, role="author"
+ )
# add an extra hint field; won't end up in serialized object
rc._lang = lang
persons.append(rc)
@@ -60,12 +57,12 @@ def parse_jalc_persons(raw_persons):
if not persons:
return []
- if all([p._lang == 'en' for p in persons]) or all([p._lang == 'ja' for p in persons]):
+ if all([p._lang == "en" for p in persons]) or all([p._lang == "ja" for p in persons]):
# all english names, or all japanese names
return persons
# for debugging
- #if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']):
+ # if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']):
# print("INTERESTING: {}".format(persons[0]))
start_lang = persons[0]._lang
@@ -74,10 +71,10 @@ def parse_jalc_persons(raw_persons):
if p._lang == start_lang:
contribs.append(p)
else:
- if p._lang == 'en' and contribs[-1]._lang == 'ja':
+ if p._lang == "en" and contribs[-1]._lang == "ja":
eng = p
jpn = contribs[-1]
- elif p._lang == 'ja' and contribs[-1]._lang == 'en':
+ elif p._lang == "ja" and contribs[-1]._lang == "en":
eng = contribs[-1]
jpn = p
else:
@@ -85,11 +82,11 @@ def parse_jalc_persons(raw_persons):
contribs.append(p)
continue
eng.extra = {
- 'original_name': {
- 'lang': jpn._lang,
- 'raw_name': jpn.raw_name,
- 'given_name': jpn.given_name,
- 'surname': jpn.surname,
+ "original_name": {
+ "lang": jpn._lang,
+ "raw_name": jpn.raw_name,
+ "given_name": jpn.given_name,
+ "surname": jpn.surname,
},
}
contribs[-1] = eng
@@ -105,18 +102,19 @@ class JalcImporter(EntityImporter):
def __init__(self, api, issn_map_file, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of JALC DOI metadata")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JalcImporter')
- super().__init__(api,
+ eg_desc = kwargs.get("editgroup_description", "Automated import of JALC DOI metadata")
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JalcImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
- self.create_containers = kwargs.get('create_containers', True)
- extid_map_file = kwargs.get('extid_map_file')
+ self.create_containers = kwargs.get("create_containers", True)
+ extid_map_file = kwargs.get("extid_map_file")
self.extid_map_db = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -129,12 +127,27 @@ class JalcImporter(EntityImporter):
def lookup_ext_ids(self, doi):
if self.extid_map_db is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
- [doi.lower()]).fetchone()
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = self.extid_map_db.execute(
+ "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
+ ).fetchone()
if row is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = [str(cell or "") or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
@@ -163,27 +176,27 @@ class JalcImporter(EntityImporter):
titles = record.find_all("title")
if not titles:
return None
- title = titles[0].get_text().replace('\n', ' ').strip()
+ title = titles[0].get_text().replace("\n", " ").strip()
original_title = None
- if title.endswith('.'):
+ if title.endswith("."):
title = title[:-1]
if len(titles) > 1:
- original_title = titles[1].get_text().replace('\n', ' ').strip()
- if original_title.endswith('.'):
+ original_title = titles[1].get_text().replace("\n", " ").strip()
+ if original_title.endswith("."):
original_title = original_title[:-1]
doi = None
if record.doi:
doi = clean_doi(record.doi.string.strip().lower())
- if doi.startswith('http://dx.doi.org/'):
- doi = doi.replace('http://dx.doi.org/', '')
- elif doi.startswith('https://dx.doi.org/'):
- doi = doi.replace('https://dx.doi.org/', '')
- elif doi.startswith('http://doi.org/'):
- doi = doi.replace('http://doi.org/', '')
- elif doi.startswith('https://doi.org/'):
- doi = doi.replace('https://doi.org/', '')
- if not (doi.startswith('10.') and '/' in doi):
+ if doi.startswith("http://dx.doi.org/"):
+ doi = doi.replace("http://dx.doi.org/", "")
+ elif doi.startswith("https://dx.doi.org/"):
+ doi = doi.replace("https://dx.doi.org/", "")
+ elif doi.startswith("http://doi.org/"):
+ doi = doi.replace("http://doi.org/", "")
+ elif doi.startswith("https://doi.org/"):
+ doi = doi.replace("https://doi.org/", "")
+ if not (doi.startswith("10.") and "/" in doi):
sys.stderr.write("bogus JALC DOI: {}\n".format(doi))
doi = None
if not doi:
@@ -202,7 +215,9 @@ class JalcImporter(EntityImporter):
if date:
date = date.string
if len(date) == 10:
- release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date()
+ release_date = datetime.datetime.strptime(
+ date["completed-date"], DATE_FMT
+ ).date()
release_year = release_date.year
release_date = release_date.isoformat()
elif len(date) == 4 and date.isdigit():
@@ -214,7 +229,7 @@ class JalcImporter(EntityImporter):
if record.endingPage and record.endingPage.string.strip():
pages = "{}-{}".format(pages, record.endingPage.string.strip())
# double check to prevent "-" as pages
- if pages and pages.strip() == '-':
+ if pages and pages.strip() == "-":
pages = None
volume = None
@@ -242,9 +257,13 @@ class JalcImporter(EntityImporter):
container_extra = dict()
if record.publicationName:
- pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()]
+ pubs = [
+ p.get_text().replace("\n", " ").strip()
+ for p in record.find_all("publicationName")
+ if p.get_text()
+ ]
pubs = [clean(p) for p in pubs if p]
- assert(pubs)
+ assert pubs
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
if len(pubs) > 1 and is_cjk(pubs[0]):
@@ -252,10 +271,14 @@ class JalcImporter(EntityImporter):
pubs = [pubs[1], pubs[0]]
container_name = clean(pubs[0])
if len(pubs) > 1:
- container_extra['original_name'] = clean(pubs[1])
+ container_extra["original_name"] = clean(pubs[1])
if record.publisher:
- pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()]
+ pubs = [
+ p.get_text().replace("\n", " ").strip()
+ for p in record.find_all("publisher")
+ if p.get_text()
+ ]
pubs = [p for p in pubs if p]
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
@@ -265,20 +288,25 @@ class JalcImporter(EntityImporter):
if pubs:
publisher = clean(pubs[0])
if len(pubs) > 1:
- container_extra['publisher_aliases'] = pubs[1:]
-
- if (container_id is None and self.create_containers and (issnl is not None)
- and container_name):
+ container_extra["publisher_aliases"] = pubs[1:]
+
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and container_name
+ ):
# name, type, publisher, issnl
# extra: issnp, issne, original_name, languages, country
- container_extra['country'] = 'jp'
- container_extra['languages'] = ['ja']
+ container_extra["country"] = "jp"
+ container_extra["languages"] = ["ja"]
ce = fatcat_openapi_client.ContainerEntity(
name=container_name,
- container_type='journal',
+ container_type="journal",
publisher=publisher,
issnl=issnl,
- extra=(container_extra or None))
+ extra=(container_extra or None),
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
# short-cut future imports in same batch
@@ -301,7 +329,7 @@ class JalcImporter(EntityImporter):
# group-title
# always put at least an empty dict here to indicate the DOI registrar
# (informally)
- extra['jalc'] = extra_jalc
+ extra["jalc"] = extra_jalc
title = clean(title)
if not title:
@@ -312,24 +340,24 @@ class JalcImporter(EntityImporter):
title=title,
original_title=clean(original_title),
release_type=release_type,
- release_stage='published',
+ release_stage="published",
release_date=release_date,
release_year=release_year,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids['pmid'],
- pmcid=extids['pmcid'],
- wikidata_qid=extids['wikidata_qid'],
- core=extids['core_id'],
- arxiv=extids['arxiv_id'],
- jstor=extids['jstor_id'],
+ pmid=extids["pmid"],
+ pmcid=extids["pmcid"],
+ wikidata_qid=extids["wikidata_qid"],
+ core=extids["core_id"],
+ arxiv=extids["arxiv_id"],
+ jstor=extids["jstor_id"],
),
volume=volume,
issue=issue,
pages=pages,
publisher=publisher,
language=lang,
- #license_slug
+ # license_slug
container_id=container_id,
contribs=contribs,
extra=extra,
@@ -351,17 +379,20 @@ class JalcImporter(EntityImporter):
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
if existing:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def parse_file(self, handle):
"""
@@ -374,11 +405,11 @@ class JalcImporter(EntityImporter):
# 2. iterate over articles, call parse_article on each
for record in soup.find_all("Description"):
resp = self.parse_record(record)
- #print(json.dumps(resp))
+ # print(json.dumps(resp))
print(resp)
- #sys.exit(-1)
+ # sys.exit(-1)
-if __name__=='__main__':
+if __name__ == "__main__":
parser = JalcImporter(None, None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index 25d7b3b5..6d1fefa3 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from .common import EntityImporter, clean
@@ -11,18 +10,20 @@ def or_none(s):
return None
return s
+
def truthy(s):
if s is None:
return None
s = s.lower()
- if s in ('true', 't', 'yes', 'y', '1'):
+ if s in ("true", "t", "yes", "y", "1"):
return True
- elif s in ('false', 'f', 'no', 'n', '0'):
+ elif s in ("false", "f", "no", "n", "0"):
return False
else:
return None
+
class JournalMetadataImporter(EntityImporter):
"""
Imports journal metadata ("containers") by ISSN, currently from a custom
@@ -33,17 +34,16 @@ class JournalMetadataImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JournalMetadataImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, raw_record):
- if raw_record.get('issnl') and raw_record.get('name'):
+ if raw_record.get("issnl") and raw_record.get("name"):
return True
return False
@@ -54,52 +54,68 @@ class JournalMetadataImporter(EntityImporter):
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
- if not row.get('name'):
+ if not row.get("name"):
# Name is required (by schema)
return None
extra = dict()
- for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
- 'coden', 'aliases', 'original_name', 'first_year', 'last_year',
- 'platform', 'default_license', 'road', 'mimetypes',
- 'sherpa_romeo', 'kbart'):
+ for key in (
+ "issne",
+ "issnp",
+ "languages",
+ "country",
+ "urls",
+ "abbrev",
+ "coden",
+ "aliases",
+ "original_name",
+ "first_year",
+ "last_year",
+ "platform",
+ "default_license",
+ "road",
+ "mimetypes",
+ "sherpa_romeo",
+ "kbart",
+ ):
if row.get(key):
extra[key] = row[key]
# TODO: not including for now: norwegian, dois/crossref, ia
extra_doaj = dict()
- if row.get('doaj'):
- if row['doaj'].get('as_of'):
- extra_doaj['as_of'] = row['doaj']['as_of']
- if row['doaj'].get('works'):
- extra_doaj['works'] = row['doaj']['works']
+ if row.get("doaj"):
+ if row["doaj"].get("as_of"):
+ extra_doaj["as_of"] = row["doaj"]["as_of"]
+ if row["doaj"].get("works"):
+ extra_doaj["works"] = row["doaj"]["works"]
if extra_doaj:
- extra['doaj'] = extra_doaj
+ extra["doaj"] = extra_doaj
extra_ia = dict()
# TODO: would like an ia.longtail_ia flag
- if row.get('sim'):
+ if row.get("sim"):
# NB: None case of the .get() here is blech, but othrwise
# extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on
- extra_ia['sim'] = {
- 'year_spans': row['sim'].get('year_spans'),
+ extra_ia["sim"] = {
+ "year_spans": row["sim"].get("year_spans"),
}
if extra_ia:
- extra['ia'] = extra_ia
+ extra["ia"] = extra_ia
- name = clean(row.get('name'))
+ name = clean(row.get("name"))
if not name:
return None
ce = fatcat_openapi_client.ContainerEntity(
- issnl=row['issnl'],
- issne=row.get('issne'),
- issnp=row.get('issnp'),
- container_type=None, # TODO
+ issnl=row["issnl"],
+ issne=row.get("issne"),
+ issnp=row.get("issnp"),
+ container_type=None, # TODO
name=name,
- publisher=clean(row.get('publisher')),
- wikidata_qid=None, # TODO
- extra=extra)
+ publisher=clean(row.get("publisher")),
+ wikidata_qid=None, # TODO
+ extra=extra,
+ )
return ce
def try_update(self, ce):
@@ -118,23 +134,26 @@ class JournalMetadataImporter(EntityImporter):
# for now, only update KBART, and only if there is new content
if not existing.extra:
existing.extra = dict()
- if ce.extra.get('kbart') and (existing.extra.get('kbart') != ce.extra['kbart']):
- if not existing.extra.get('kbart'):
- existing.extra['kbart'] = {}
- existing.extra['kbart'].update(ce.extra['kbart'])
+ if ce.extra.get("kbart") and (existing.extra.get("kbart") != ce.extra["kbart"]):
+ if not existing.extra.get("kbart"):
+ existing.extra["kbart"] = {}
+ existing.extra["kbart"].update(ce.extra["kbart"])
self.api.update_container(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
else:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# if we got this far, it's a bug
raise NotImplementedError
def insert_batch(self, batch):
- self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_container_auto_batch(
+ fatcat_openapi_client.ContainerAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index d37424d6..8c7bfad4 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -1,4 +1,3 @@
-
import datetime
import json
import sys
@@ -12,10 +11,10 @@ from .crossref import CONTAINER_TYPE_MAP
# TODO: more entries?
JSTOR_CONTRIB_MAP = {
- 'author': 'author',
- 'editor': 'editor',
- 'translator': 'translator',
- 'illustrator': 'illustrator',
+ "author": "author",
+ "editor": "editor",
+ "translator": "translator",
+ "illustrator": "illustrator",
}
JSTOR_TYPE_MAP = {
@@ -26,6 +25,7 @@ JSTOR_TYPE_MAP = {
"research-article": "article-journal",
}
+
class JstorImporter(EntityImporter):
"""
Importer for JSTOR bulk XML metadata (eg, from their Early Journals
@@ -34,17 +34,18 @@ class JstorImporter(EntityImporter):
def __init__(self, api, issn_map_file, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of JSTOR XML metadata")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter')
- super().__init__(api,
+ eg_desc = kwargs.get("editgroup_description", "Automated import of JSTOR XML metadata")
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JstorImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
- self.create_containers = kwargs.get('create_containers', True)
+ self.create_containers = kwargs.get("create_containers", True)
self.read_issn_map_file(issn_map_file)
@@ -62,20 +63,22 @@ class JstorImporter(EntityImporter):
extra = dict()
extra_jstor = dict()
- release_type = JSTOR_TYPE_MAP.get(article['article-type'])
+ release_type = JSTOR_TYPE_MAP.get(article["article-type"])
title = article_meta.find("article-title")
if title and title.get_text():
- title = title.get_text().replace('\n', ' ').strip()
+ title = title.get_text().replace("\n", " ").strip()
elif title and not title.get_text():
title = None
- if not title and release_type.startswith('review') and article_meta.product.source:
- title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text())
+ if not title and release_type.startswith("review") and article_meta.product.source:
+ title = "Review: {}".format(
+ article_meta.product.source.replace("\n", " ").get_text()
+ )
if not title:
return None
- if title.endswith('.'):
+ if title.endswith("."):
title = title[:-1]
if "[Abstract]" in title:
@@ -93,12 +96,12 @@ class JstorImporter(EntityImporter):
title = title[1:-1]
# JSTOR journal-id
- journal_ids = [j.string for j in journal_meta.find_all('journal-id')]
+ journal_ids = [j.string for j in journal_meta.find_all("journal-id")]
if journal_ids:
- extra_jstor['journal_ids'] = journal_ids
+ extra_jstor["journal_ids"] = journal_ids
- journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ')
- publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ')
+ journal_title = journal_meta.find("journal-title").get_text().replace("\n", " ")
+ publisher = journal_meta.find("publisher-name").get_text().replace("\n", " ")
issn = journal_meta.find("issn")
if issn:
issn = issn.string
@@ -113,13 +116,18 @@ class JstorImporter(EntityImporter):
container_id = self.lookup_issnl(issnl)
# create container if it doesn't exist
- if (container_id is None and self.create_containers and (issnl is not None)
- and journal_title):
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and journal_title
+ ):
ce = fatcat_openapi_client.ContainerEntity(
issnl=issnl,
publisher=publisher,
container_type=self.map_container_type(release_type),
- name=clean(journal_title, force_xml=True))
+ name=clean(journal_title, force_xml=True),
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
@@ -132,8 +140,8 @@ class JstorImporter(EntityImporter):
if jstor_id:
jstor_id = jstor_id.string.strip()
if not jstor_id and doi:
- assert doi.startswith('10.2307/')
- jstor_id = doi.replace('10.2307/', '')
+ assert doi.startswith("10.2307/")
+ jstor_id = doi.replace("10.2307/", "")
assert jstor_id and int(jstor_id)
contribs = []
@@ -142,13 +150,13 @@ class JstorImporter(EntityImporter):
for c in cgroup.find_all("contrib"):
given = c.find("given-names")
if given:
- given = clean(given.get_text().replace('\n', ' '))
+ given = clean(given.get_text().replace("\n", " "))
surname = c.find("surname")
if surname:
- surname = clean(surname.get_text().replace('\n', ' '))
+ surname = clean(surname.get_text().replace("\n", " "))
raw_name = c.find("string-name")
if raw_name:
- raw_name = clean(raw_name.get_text().replace('\n', ' '))
+ raw_name = clean(raw_name.get_text().replace("\n", " "))
if not raw_name:
if given and surname:
@@ -156,15 +164,17 @@ class JstorImporter(EntityImporter):
elif surname:
raw_name = surname
- role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author'))
- if not role and c.get('contrib-type'):
- sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c['contrib-type']))
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- role=role,
- raw_name=raw_name,
- given_name=given,
- surname=surname,
- ))
+ role = JSTOR_CONTRIB_MAP.get(c.get("contrib-type", "author"))
+ if not role and c.get("contrib-type"):
+ sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c["contrib-type"]))
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ role=role,
+ raw_name=raw_name,
+ given_name=given,
+ surname=surname,
+ )
+ )
for i, contrib in enumerate(contribs):
if contrib.raw_name != "et al.":
@@ -172,14 +182,13 @@ class JstorImporter(EntityImporter):
release_year = None
release_date = None
- pub_date = article_meta.find('pub-date')
+ pub_date = article_meta.find("pub-date")
if pub_date and pub_date.year:
release_year = int(pub_date.year.string)
if pub_date.month and pub_date.day:
release_date = datetime.date(
- release_year,
- int(pub_date.month.string),
- int(pub_date.day.string))
+ release_year, int(pub_date.month.string), int(pub_date.day.string)
+ )
if release_date.day == 1 and release_date.month == 1:
# suspect jan 1st dates get set by JSTOR when actual
# date not known (citation needed), so drop them
@@ -208,10 +217,10 @@ class JstorImporter(EntityImporter):
warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
# JSTOR issue-id
- if article_meta.find('issue-id'):
- issue_id = clean(article_meta.find('issue-id').string)
+ if article_meta.find("issue-id"):
+ issue_id = clean(article_meta.find("issue-id").string)
if issue_id:
- extra_jstor['issue_id'] = issue_id
+ extra_jstor["issue_id"] = issue_id
# everything in JSTOR is published
release_stage = "published"
@@ -225,14 +234,14 @@ class JstorImporter(EntityImporter):
# group-title
# pubmed: retraction refs
if extra_jstor:
- extra['jstor'] = extra_jstor
+ extra["jstor"] = extra_jstor
if not extra:
extra = None
re = fatcat_openapi_client.ReleaseEntity(
- #work_id
+ # work_id
title=title,
- #original_title
+ # original_title
release_type=release_type,
release_stage=release_stage,
release_date=release_date,
@@ -246,21 +255,16 @@ class JstorImporter(EntityImporter):
pages=pages,
publisher=publisher,
language=language,
- #license_slug
-
+ # license_slug
# content, mimetype, lang
- #abstracts=abstracts,
-
+ # abstracts=abstracts,
contribs=contribs,
-
# key, year, container_name, title, locator
# extra: volume, authors, issue, publisher, identifiers
- #refs=refs,
-
+ # refs=refs,
# name, type, publisher, issnl
# extra: issnp, issne, original_name, languages, country
container_id=container_id,
-
extra=extra,
)
return re
@@ -289,12 +293,12 @@ class JstorImporter(EntityImporter):
if existing and existing.ext_ids.jstor:
# don't update if it already has JSTOR ID
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
elif existing:
# but do update if only DOI was set
existing.ext_ids.jstor = re.ext_ids.jstor
- existing.extra['jstor'] = re.extra['jstor']
+ existing.extra["jstor"] = re.extra["jstor"]
# better release_type detection, and some other fields
# TODO: don't do this over-writing in the future? assuming here
# this is a one-time batch import over/extending bootstrap crossref
@@ -304,17 +308,20 @@ class JstorImporter(EntityImporter):
existing.contribs = re.contribs
existing.language = re.language
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def parse_file(self, handle):
@@ -325,8 +332,9 @@ class JstorImporter(EntityImporter):
for article in soup.find_all("article"):
resp = self.parse_record(article)
print(json.dumps(resp))
- #sys.exit(-1)
+ # sys.exit(-1)
+
-if __name__=='__main__':
+if __name__ == "__main__":
parser = JstorImporter(None, None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 09807276..7c2a6a87 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from fatcat_tools.normal import clean_doi
@@ -32,13 +31,13 @@ class MatchedImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Import of large-scale file-to-release match results. Source of metadata varies."
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Import of large-scale file-to-release match results. Source of metadata varies."
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.MatchedImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
self.default_mimetype = kwargs.get("default_mimetype", None)
@@ -46,14 +45,14 @@ class MatchedImporter(EntityImporter):
return True
def parse_record(self, obj):
- dois = [d.lower() for d in obj.get('dois', [])]
+ dois = [d.lower() for d in obj.get("dois", [])]
# lookup dois
re_list = set()
for doi in dois:
doi = clean_doi(doi)
if not doi:
- self.counts['skip-bad-doi'] += 1
+ self.counts["skip-bad-doi"] += 1
return None
try:
re = self.api.lookup_release(doi=doi)
@@ -62,13 +61,22 @@ class MatchedImporter(EntityImporter):
raise err
re = None
if re is None:
- #print("DOI not found: {}".format(doi))
+ # print("DOI not found: {}".format(doi))
pass
else:
re_list.add(re.ident)
# look up other external ids
- for extid_type in ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid', 'core', 'isbn13', 'ark'):
+ for extid_type in (
+ "arxiv",
+ "pmid",
+ "pmcid",
+ "jstor",
+ "wikidata_qid",
+ "core",
+ "isbn13",
+ "ark",
+ ):
extid = obj.get(extid_type)
if extid:
try:
@@ -84,49 +92,47 @@ class MatchedImporter(EntityImporter):
release_ids = list(re_list)
if len(release_ids) == 0:
- self.counts['skip-no-releases'] += 1
+ self.counts["skip-no-releases"] += 1
return None
if len(release_ids) > SANE_MAX_RELEASES:
- self.counts['skip-too-many-releases'] += 1
+ self.counts["skip-too-many-releases"] += 1
return None
# parse URLs and CDX
urls = set()
- for url in obj.get('urls', []):
+ for url in obj.get("urls", []):
url = make_rel_url(url, default_link_rel=self.default_link_rel)
if url is not None:
urls.add(url)
- for cdx in obj.get('cdx', []):
- original = cdx['url']
- if cdx.get('dt'):
- wayback = "https://web.archive.org/web/{}/{}".format(
- cdx['dt'],
- original)
+ for cdx in obj.get("cdx", []):
+ original = cdx["url"]
+ if cdx.get("dt"):
+ wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
urls.add(("webarchive", wayback))
url = make_rel_url(original, default_link_rel=self.default_link_rel)
if url is not None:
urls.add(url)
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
if len(urls) == 0:
- self.counts['skip-no-urls'] += 1
+ self.counts["skip-no-urls"] += 1
return None
if len(urls) > SANE_MAX_URLS:
- self.counts['skip-too-many-urls'] += 1
+ self.counts["skip-too-many-urls"] += 1
return None
- size = obj.get('size')
+ size = obj.get("size")
if size:
size = int(size)
- mimetype = obj.get('mimetype', self.default_mimetype)
+ mimetype = obj.get("mimetype", self.default_mimetype)
if not mimetype and urls:
- if urls[0].url.endswith('.pdf'):
- mimetype = 'application/pdf'
+ if urls[0].url.endswith(".pdf"):
+ mimetype = "application/pdf"
fe = fatcat_openapi_client.FileEntity(
- md5=obj.get('md5'),
- sha1=obj['sha1'],
- sha256=obj.get('sha256'),
+ md5=obj.get("md5"),
+ sha1=obj["sha1"],
+ sha256=obj.get("sha256"),
size=size,
mimetype=mimetype,
release_ids=release_ids,
@@ -149,28 +155,30 @@ class MatchedImporter(EntityImporter):
combined_release_ids = list(set(fe.release_ids + existing.release_ids))
if set(combined_release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
# no new release matches *and* there are already existing URLs
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# check for edit conflicts
if existing.ident in [e.ident for e in self._edits_inflight]:
- self.counts['skip-update-inflight'] += 1
+ self.counts["skip-update-inflight"] += 1
return False
# minimum viable "existing" URL cleanup to fix dupes and broken links:
# remove 'None' wayback URLs, and set archive.org rel 'archive'
- existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+ existing.urls = [
+ u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url)
+ ]
for i in range(len(existing.urls)):
u = existing.urls[i]
- if u.rel == 'repository' and '://archive.org/download/' in u.url:
- existing.urls[i].rel = 'archive'
+ if u.rel == "repository" and "://archive.org/download/" in u.url:
+ existing.urls[i].rel = "archive"
# special case: if importing *new* from archive.org arxiv collections,
# blow away any existing release_id mappings; this is a direct arxiv_id
# map. This *should* be safe to run in all matched imports.
is_arxiv = False
for u in fe.urls:
- if 'archive.org/download/arxiv' in u.url.lower():
+ if "archive.org/download/arxiv" in u.url.lower():
is_arxiv = True
break
if is_arxiv and fe.release_ids:
@@ -178,14 +186,16 @@ class MatchedImporter(EntityImporter):
# merge the existing into this one and update
existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
- existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls]
+ existing.urls = [
+ fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls
+ ]
if len(existing.urls) > SANE_MAX_URLS:
- self.counts['skip-update-too-many-url'] += 1
+ self.counts["skip-update-too-many-url"] += 1
return None
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
if len(existing.release_ids) > SANE_MAX_RELEASES:
- self.counts['skip-update-too-many-releases'] += 1
+ self.counts["skip-update-too-many-releases"] += 1
return None
existing.mimetype = existing.mimetype or fe.mimetype
existing.size = existing.size or fe.size
@@ -194,12 +204,15 @@ class MatchedImporter(EntityImporter):
existing.sha256 = existing.sha256 or fe.sha256
edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
self._edits_inflight.append(edit)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
def insert_batch(self, batch):
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 3bdd23a1..b514e6e5 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -1,4 +1,3 @@
-
import sys
import fatcat_openapi_client
@@ -8,7 +7,7 @@ from .common import EntityImporter, clean
def value_or_none(e):
if type(e) == dict:
- e = e.get('value')
+ e = e.get("value")
if type(e) == str and len(e) == 0:
e = None
# TODO: this is probably bogus; patched in desperation; remove?
@@ -21,18 +20,17 @@ def value_or_none(e):
return None
return e
-class OrcidImporter(EntityImporter):
+class OrcidImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of ORCID metadata, from official bulk releases.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of ORCID metadata, from official bulk releases.",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.OrcidImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, raw_record):
return True
@@ -43,16 +41,16 @@ class OrcidImporter(EntityImporter):
returns a CreatorEntity
"""
- if 'person' not in obj:
+ if "person" not in obj:
return False
- name = obj['person']['name']
+ name = obj["person"]["name"]
if not name:
return None
extra = None
- given = value_or_none(name.get('given-names'))
- sur = value_or_none(name.get('family-name'))
- display = value_or_none(name.get('credit-name'))
+ given = value_or_none(name.get("given-names"))
+ sur = value_or_none(name.get("family-name"))
+ display = value_or_none(name.get("credit-name"))
if display is None:
# TODO: sorry human beings
if given and sur:
@@ -61,7 +59,7 @@ class OrcidImporter(EntityImporter):
display = sur
elif given:
display = given
- orcid = obj['orcid-identifier']['path']
+ orcid = obj["orcid-identifier"]["path"]
if not self.is_orcid(orcid):
sys.stderr.write("Bad ORCID: {}\n".format(orcid))
return None
@@ -74,7 +72,8 @@ class OrcidImporter(EntityImporter):
given_name=clean(given),
surname=clean(sur),
display_name=display,
- extra=extra)
+ extra=extra,
+ )
return ce
def try_update(self, raw_record):
@@ -88,14 +87,17 @@ class OrcidImporter(EntityImporter):
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
if existing:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
return True
def insert_batch(self, batch):
- self.api.create_creator_auto_batch(fatcat_openapi_client.CreatorAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_creator_auto_batch(
+ fatcat_openapi_client.CreatorAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 00ad54d0..cfdafcf7 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -1,4 +1,3 @@
-
import datetime
import json
import sys
@@ -13,42 +12,42 @@ from .common import LANG_MAP_MARC, EntityImporter, clean
# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
PUBMED_RELEASE_TYPE_MAP = {
- #Adaptive Clinical Trial
+ # Adaptive Clinical Trial
"Address": "speech",
"Autobiography": "book",
- #Bibliography
+ # Bibliography
"Biography": "book",
- #Case Reports
+ # Case Reports
"Classical Article": "article-journal",
- #Clinical Conference
- #Clinical Study
- #Clinical Trial
- #Clinical Trial, Phase I
- #Clinical Trial, Phase II
- #Clinical Trial, Phase III
- #Clinical Trial, Phase IV
- #Clinical Trial Protocol
- #Clinical Trial, Veterinary
- #Collected Works
- #Comparative Study
- #Congress
- #Consensus Development Conference
- #Consensus Development Conference, NIH
- #Controlled Clinical Trial
+ # Clinical Conference
+ # Clinical Study
+ # Clinical Trial
+ # Clinical Trial, Phase I
+ # Clinical Trial, Phase II
+ # Clinical Trial, Phase III
+ # Clinical Trial, Phase IV
+ # Clinical Trial Protocol
+ # Clinical Trial, Veterinary
+ # Collected Works
+ # Comparative Study
+ # Congress
+ # Consensus Development Conference
+ # Consensus Development Conference, NIH
+ # Controlled Clinical Trial
"Dataset": "dataset",
- #Dictionary
- #Directory
- #Duplicate Publication
+ # Dictionary
+ # Directory
+ # Duplicate Publication
"Editorial": "editorial",
- #English Abstract # doesn't indicate that this is abstract-only
- #Equivalence Trial
- #Evaluation Studies
- #Expression of Concern
- #Festschrift
- #Government Document
- #Guideline
+ # English Abstract # doesn't indicate that this is abstract-only
+ # Equivalence Trial
+ # Evaluation Studies
+ # Expression of Concern
+ # Festschrift
+ # Government Document
+ # Guideline
"Historical Article": "article-journal",
- #Interactive Tutorial
+ # Interactive Tutorial
"Interview": "interview",
"Introductory Journal Article": "article-journal",
"Journal Article": "article-journal",
@@ -56,53 +55,65 @@ PUBMED_RELEASE_TYPE_MAP = {
"Legal Case": "legal_case",
"Legislation": "legislation",
"Letter": "letter",
- #Meta-Analysis
- #Multicenter Study
- #News
+ # Meta-Analysis
+ # Multicenter Study
+ # News
"Newspaper Article": "article-newspaper",
- #Observational Study
- #Observational Study, Veterinary
- #Overall
- #Patient Education Handout
- #Periodical Index
- #Personal Narrative
- #Portrait
- #Practice Guideline
- #Pragmatic Clinical Trial
- #Publication Components
- #Publication Formats
- #Publication Type Category
- #Randomized Controlled Trial
- #Research Support, American Recovery and Reinvestment Act
- #Research Support, N.I.H., Extramural
- #Research Support, N.I.H., Intramural
- #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
- #Research Support, U.S. Gov't, P.H.S.
- #Review # in the "literature review" sense, not "product review"
- #Scientific Integrity Review
- #Study Characteristics
- #Support of Research
- #Systematic Review
+ # Observational Study
+ # Observational Study, Veterinary
+ # Overall
+ # Patient Education Handout
+ # Periodical Index
+ # Personal Narrative
+ # Portrait
+ # Practice Guideline
+ # Pragmatic Clinical Trial
+ # Publication Components
+ # Publication Formats
+ # Publication Type Category
+ # Randomized Controlled Trial
+ # Research Support, American Recovery and Reinvestment Act
+ # Research Support, N.I.H., Extramural
+ # Research Support, N.I.H., Intramural
+ # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
+ # Research Support, U.S. Gov't, P.H.S.
+ # Review # in the "literature review" sense, not "product review"
+ # Scientific Integrity Review
+ # Study Characteristics
+ # Support of Research
+ # Systematic Review
"Technical Report": "report",
- #Twin Study
- #Validation Studies
- #Video-Audio Media
- #Webcasts
+ # Twin Study
+ # Validation Studies
+ # Video-Audio Media
+ # Webcasts
}
MONTH_ABBR_MAP = {
- "Jan": 1, "01": 1,
- "Feb": 2, "02": 2,
- "Mar": 3, "03": 3,
- "Apr": 4, "04": 4,
- "May": 5, "05": 5,
- "Jun": 6, "06": 6,
- "Jul": 7, "07": 7,
- "Aug": 8, "08": 8,
- "Sep": 9, "09": 9,
- "Oct": 10, "10": 10,
- "Nov": 11, "11": 11,
- "Dec": 12, "12": 12,
+ "Jan": 1,
+ "01": 1,
+ "Feb": 2,
+ "02": 2,
+ "Mar": 3,
+ "03": 3,
+ "Apr": 4,
+ "04": 4,
+ "May": 5,
+ "05": 5,
+ "Jun": 6,
+ "06": 6,
+ "Jul": 7,
+ "07": 7,
+ "Aug": 8,
+ "08": 8,
+ "Sep": 9,
+ "09": 9,
+ "Oct": 10,
+ "10": 10,
+ "Nov": 11,
+ "11": 11,
+ "Dec": 12,
+ "12": 12,
}
# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
@@ -295,11 +306,10 @@ COUNTRY_NAME_MAP = {
"United Kingdom": "gb",
"United States": "us",
"Uruguay": "uy",
-
# Additions from running over large files
"Bosnia and Herzegovina": "ba",
- #"International"
- "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
+ # "International"
+ "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
"Russia (Federation)": "ru",
"Scotland": "gb",
"England": "gb",
@@ -320,18 +330,21 @@ class PubmedImporter(EntityImporter):
def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of PubMed/MEDLINE XML metadata")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter')
- super().__init__(api,
+ eg_desc = kwargs.get(
+ "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata"
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.PubmedImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
self.lookup_refs = lookup_refs
- self.create_containers = kwargs.get('create_containers', True)
+ self.create_containers = kwargs.get("create_containers", True)
self.read_issn_map_file(issn_map_file)
def want(self, obj):
@@ -365,15 +378,15 @@ class PubmedImporter(EntityImporter):
release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
break
if pub_types:
- extra_pubmed['pub_types'] = pub_types
+ extra_pubmed["pub_types"] = pub_types
if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
release_type = "retraction"
retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
if retraction_of:
if retraction_of.RefSource:
- extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
+ extra_pubmed["retraction_of_raw"] = retraction_of.RefSource.string
if retraction_of.PMID:
- extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
+ extra_pubmed["retraction_of_pmid"] = retraction_of.PMID.string
# everything in medline is published
release_stage = "published"
@@ -388,18 +401,18 @@ class PubmedImporter(EntityImporter):
elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"):
withdrawn_status = "concern"
- pages = medline.find('MedlinePgn')
+ pages = medline.find("MedlinePgn")
if pages:
pages = pages.string
- title = medline.Article.ArticleTitle.get_text() # always present
+ title = medline.Article.ArticleTitle.get_text() # always present
if title:
- title = title.replace('\n', ' ')
- if title.endswith('.'):
+ title = title.replace("\n", " ")
+ if title.endswith("."):
title = title[:-1]
# this hides some "special" titles, but the vast majority are
# translations; translations don't always include the original_title
- if title.startswith('[') and title.endswith(']'):
+ if title.startswith("[") and title.endswith("]"):
title = title[1:-1]
else:
# will filter out later
@@ -408,8 +421,8 @@ class PubmedImporter(EntityImporter):
original_title = medline.Article.find("VernacularTitle", recurse=False)
if original_title:
original_title = original_title.get_text() or None
- original_title = original_title.replace('\n', ' ')
- if original_title and original_title.endswith('.'):
+ original_title = original_title.replace("\n", " ")
+ if original_title and original_title.endswith("."):
original_title = original_title[:-1]
if original_title and not title:
@@ -428,7 +441,9 @@ class PubmedImporter(EntityImporter):
else:
language = LANG_MAP_MARC.get(language)
if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC):
- warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
+ warnings.warn(
+ "MISSING MARC LANG: {}".format(medline.Article.Language.string)
+ )
### Journal/Issue Metadata
# MedlineJournalInfo is always present
@@ -441,9 +456,9 @@ class PubmedImporter(EntityImporter):
country_name = mji.Country.string.strip()
country_code = COUNTRY_NAME_MAP.get(country_name)
if country_code:
- container_extra['country'] = country_code
+ container_extra["country"] = country_code
elif country_name:
- container_extra['country_name'] = country_name
+ container_extra["country_name"] = country_name
if mji.find("ISSNLinking"):
issnl = mji.ISSNLinking.string
@@ -462,7 +477,7 @@ class PubmedImporter(EntityImporter):
if issnl:
container_id = self.lookup_issnl(issnl)
- pub_date = medline.Article.find('ArticleDate')
+ pub_date = medline.Article.find("ArticleDate")
if not pub_date:
pub_date = journal.PubDate
if not pub_date:
@@ -476,7 +491,8 @@ class PubmedImporter(EntityImporter):
release_date = datetime.date(
release_year,
MONTH_ABBR_MAP[pub_date.Month.string],
- int(pub_date.Day.string))
+ int(pub_date.Day.string),
+ )
release_date = release_date.isoformat()
except ValueError as ve:
print("bad date, skipping: {}".format(ve), file=sys.stderr)
@@ -486,25 +502,35 @@ class PubmedImporter(EntityImporter):
if len(medline_date) >= 4 and medline_date[:4].isdigit():
release_year = int(medline_date[:4])
if release_year < 1300 or release_year > 2040:
- print("bad medline year, skipping: {}".format(release_year), file=sys.stderr)
+ print(
+ "bad medline year, skipping: {}".format(release_year), file=sys.stderr
+ )
release_year = None
else:
- print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)
+ print(
+ "unparsable medline date, skipping: {}".format(medline_date),
+ file=sys.stderr,
+ )
if journal.find("Title"):
container_name = journal.Title.get_text()
- if (container_id is None and self.create_containers and (issnl is not None)
- and container_name):
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and container_name
+ ):
# name, type, publisher, issnl
# extra: original_name, languages, country
ce = fatcat_openapi_client.ContainerEntity(
name=container_name,
- container_type='journal',
- #NOTE: publisher not included
+ container_type="journal",
+ # NOTE: publisher not included
issnl=issnl,
issnp=issnp,
- extra=(container_extra or None))
+ extra=(container_extra or None),
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
@@ -521,8 +547,10 @@ class PubmedImporter(EntityImporter):
# "All abstracts are in English"
abstracts = []
primary_abstract = medline.find("Abstract")
- if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'):
- joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")])
+ if primary_abstract and primary_abstract.AbstractText.get("NlmCategory"):
+ joined = "\n".join(
+ [m.get_text() for m in primary_abstract.find_all("AbstractText")]
+ )
abst = fatcat_openapi_client.ReleaseAbstract(
content=joined,
mimetype="text/plain",
@@ -539,7 +567,7 @@ class PubmedImporter(EntityImporter):
)
if abst.content:
abstracts.append(abst)
- if abstract.find('math'):
+ if abstract.find("math"):
abst = fatcat_openapi_client.ReleaseAbstract(
# strip the <AbstractText> tags
content=str(abstract)[14:-15],
@@ -551,8 +579,8 @@ class PubmedImporter(EntityImporter):
other_abstracts = medline.find_all("OtherAbstract")
for other in other_abstracts:
lang = "en"
- if other.get('Language'):
- lang = LANG_MAP_MARC.get(other['Language'])
+ if other.get("Language"):
+ lang = LANG_MAP_MARC.get(other["Language"])
abst = fatcat_openapi_client.ReleaseAbstract(
content=other.AbstractText.get_text().strip(),
mimetype="text/plain",
@@ -572,15 +600,15 @@ class PubmedImporter(EntityImporter):
surname = None
raw_name = None
if author.ForeName:
- given_name = author.ForeName.get_text().replace('\n', ' ')
+ given_name = author.ForeName.get_text().replace("\n", " ")
if author.LastName:
- surname = author.LastName.get_text().replace('\n', ' ')
+ surname = author.LastName.get_text().replace("\n", " ")
if given_name and surname:
raw_name = "{} {}".format(given_name, surname)
elif surname:
raw_name = surname
if not raw_name and author.CollectiveName and author.CollectiveName.get_text():
- raw_name = author.CollectiveName.get_text().replace('\n', ' ')
+ raw_name = author.CollectiveName.get_text().replace("\n", " ")
contrib_extra = dict()
orcid = author.find("Identifier", Source="ORCID")
if orcid:
@@ -590,7 +618,7 @@ class PubmedImporter(EntityImporter):
orcid = orcid.replace("http://orcid.org/", "")
elif orcid.startswith("https://orcid.org/"):
orcid = orcid.replace("https://orcid.org/", "")
- elif '-' not in orcid:
+ elif "-" not in orcid:
orcid = "{}-{}-{}-{}".format(
orcid[0:4],
orcid[4:8],
@@ -598,27 +626,31 @@ class PubmedImporter(EntityImporter):
orcid[12:16],
)
creator_id = self.lookup_orcid(orcid)
- contrib_extra['orcid'] = orcid
+ contrib_extra["orcid"] = orcid
affiliations = author.find_all("Affiliation")
raw_affiliation = None
if affiliations:
- raw_affiliation = affiliations[0].get_text().replace('\n', ' ')
+ raw_affiliation = affiliations[0].get_text().replace("\n", " ")
if len(affiliations) > 1:
- contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]]
+ contrib_extra["more_affiliations"] = [
+ ra.get_text().replace("\n", " ") for ra in affiliations[1:]
+ ]
if author.find("EqualContrib"):
# TODO: schema for this?
- contrib_extra['equal'] = True
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- raw_name=raw_name,
- given_name=given_name,
- surname=surname,
- role="author",
- raw_affiliation=raw_affiliation,
- creator_id=creator_id,
- extra=contrib_extra,
- ))
-
- if medline.AuthorList['CompleteYN'] == 'N':
+ contrib_extra["equal"] = True
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ raw_name=raw_name,
+ given_name=given_name,
+ surname=surname,
+ role="author",
+ raw_affiliation=raw_affiliation,
+ creator_id=creator_id,
+ extra=contrib_extra,
+ )
+ )
+
+ if medline.AuthorList["CompleteYN"] == "N":
contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al."))
for i, contrib in enumerate(contribs):
@@ -633,7 +665,7 @@ class PubmedImporter(EntityImporter):
# note that Reference always exists within a ReferenceList, but
# that there may be multiple ReferenceList (eg, sometimes one per
# Reference)
- for ref in pubmed.find_all('Reference'):
+ for ref in pubmed.find_all("Reference"):
ref_extra = dict()
ref_doi = ref.find("ArticleId", IdType="doi")
if ref_doi:
@@ -643,22 +675,24 @@ class PubmedImporter(EntityImporter):
ref_pmid = clean_pmid(ref_pmid.string)
ref_release_id = None
if ref_doi:
- ref_extra['doi'] = ref_doi
+ ref_extra["doi"] = ref_doi
if self.lookup_refs:
ref_release_id = self.lookup_doi(ref_doi)
if ref_pmid:
- ref_extra['pmid'] = ref_pmid
+ ref_extra["pmid"] = ref_pmid
if self.lookup_refs:
ref_release_id = self.lookup_pmid(ref_pmid)
ref_raw = ref.Citation
if ref_raw:
- ref_extra['unstructured'] = ref_raw.get_text()
+ ref_extra["unstructured"] = ref_raw.get_text()
if not ref_extra:
ref_extra = None
- refs.append(fatcat_openapi_client.ReleaseRef(
- target_release_id=ref_release_id,
- extra=ref_extra,
- ))
+ refs.append(
+ fatcat_openapi_client.ReleaseRef(
+ target_release_id=ref_release_id,
+ extra=ref_extra,
+ )
+ )
if not refs:
refs = None
@@ -669,7 +703,7 @@ class PubmedImporter(EntityImporter):
# group-title
# pubmed: retraction refs
if extra_pubmed:
- extra['pubmed'] = extra_pubmed
+ extra["pubmed"] = extra_pubmed
if not extra:
extra = None
@@ -690,14 +724,14 @@ class PubmedImporter(EntityImporter):
doi=doi,
pmid=pmid,
pmcid=pmcid,
- #isbn13 # never in Article
+ # isbn13 # never in Article
),
volume=volume,
issue=issue,
pages=pages,
- #publisher # not included?
+ # publisher # not included?
language=language,
- #license_slug # not in MEDLINE
+ # license_slug # not in MEDLINE
abstracts=abstracts,
contribs=contribs,
refs=refs,
@@ -725,21 +759,22 @@ class PubmedImporter(EntityImporter):
raise err
if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format(
- existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)
+ existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid
+ )
warnings.warn(warn_str)
- self.counts['warn-pmid-doi-mismatch'] += 1
+ self.counts["warn-pmid-doi-mismatch"] += 1
# don't clobber DOI, but do group together
re.ext_ids.doi = None
re.work_id = existing.work_id
if existing and not self.do_updates:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
# TODO: any other reasons to do an update?
# don't update if it already has PMID
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
elif existing:
# but do update if only DOI was set
@@ -750,12 +785,12 @@ class PubmedImporter(EntityImporter):
existing.container_id = existing.container_id or re.container_id
existing.refs = existing.refs or re.refs
existing.abstracts = existing.abstracts or re.abstracts
- existing.extra['pubmed'] = re.extra['pubmed']
+ existing.extra["pubmed"] = re.extra["pubmed"]
# fix stub titles
if existing.title in [
- "OUP accepted manuscript",
- ]:
+ "OUP accepted manuscript",
+ ]:
existing.title = re.title
existing.original_title = existing.original_title or re.original_title
@@ -770,8 +805,8 @@ class PubmedImporter(EntityImporter):
existing.language = existing.language or re.language
# update subtitle in-place first
- if not existing.subtitle and existing.extra.get('subtitle'):
- subtitle = existing.extra.pop('subtitle')
+ if not existing.subtitle and existing.extra.get("subtitle"):
+ subtitle = existing.extra.pop("subtitle")
if type(subtitle) == list:
subtitle = subtitle[0]
if subtitle:
@@ -781,13 +816,13 @@ class PubmedImporter(EntityImporter):
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
except fatcat_openapi_client.rest.ApiException as err:
# there is a code path where we try to update the same release
# twice in a row; if that happens, just skip
# NOTE: API behavior might change in the future?
if "release_edit_editgroup_id_ident_id_key" in err.body:
- self.counts['skip-update-conflict'] += 1
+ self.counts["skip-update-conflict"] += 1
return False
else:
raise err
@@ -797,11 +832,14 @@ class PubmedImporter(EntityImporter):
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def parse_file(self, handle):
@@ -812,8 +850,9 @@ class PubmedImporter(EntityImporter):
for article in soup.find_all("PubmedArticle"):
resp = self.parse_record(article)
print(json.dumps(resp))
- #sys.exit(-1)
+ # sys.exit(-1)
+
-if __name__=='__main__':
+if __name__ == "__main__":
parser = PubmedImporter(None, None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
index 77205cee..78eeec7a 100644
--- a/python/fatcat_tools/importers/shadow.py
+++ b/python/fatcat_tools/importers/shadow.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid
@@ -30,25 +29,25 @@ class ShadowLibraryImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Import of 'Shadow Library' file/release matches"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ShadowLibraryImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
def want(self, raw_record):
"""
Only want to import records with complete file-level metadata
"""
- fm = raw_record['file_meta']
- if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
- self.counts['skip-file-meta-incomplete'] += 1
+ fm = raw_record["file_meta"]
+ if not (fm["mimetype"] and fm["md5hex"] and fm["sha256hex"] and fm["size_bytes"]):
+ self.counts["skip-file-meta-incomplete"] += 1
return False
- if fm['mimetype'] != 'application/pdf':
- self.counts['skip-not-pdf'] += 1
+ if fm["mimetype"] != "application/pdf":
+ self.counts["skip-not-pdf"] += 1
return False
return True
@@ -57,23 +56,23 @@ class ShadowLibraryImporter(EntityImporter):
We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
"""
- shadow_corpus = obj['shadow']['shadow_corpus']
+ shadow_corpus = obj["shadow"]["shadow_corpus"]
assert shadow_corpus == shadow_corpus.strip().lower()
- doi = clean_doi(obj['shadow'].get('doi'))
- pmid = clean_pmid(obj['shadow'].get('pmid'))
- isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
- shadow_id = obj['shadow'].get('shadow_id').strip()
+ doi = clean_doi(obj["shadow"].get("doi"))
+ pmid = clean_pmid(obj["shadow"].get("pmid"))
+ isbn13 = clean_isbn13(obj["shadow"].get("isbn13"))
+ shadow_id = obj["shadow"].get("shadow_id").strip()
assert shadow_id
- extra = { '{}_id'.format(shadow_corpus): shadow_id }
- for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ extra = {"{}_id".format(shadow_corpus): shadow_id}
+ for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]:
if not ext_id:
continue
- extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+ extra["{}_{}".format(shadow_corpus, ext_type)] = ext_id
# lookup release via several idents
re = None
- for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]:
if not ext_id:
continue
try:
@@ -86,29 +85,31 @@ class ShadowLibraryImporter(EntityImporter):
break
if not re:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
- release_ids = [re.ident,]
+ release_ids = [
+ re.ident,
+ ]
# parse single CDX into URLs (if exists)
urls = []
- if obj.get('cdx'):
- url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
+ if obj.get("cdx"):
+ url = make_rel_url(obj["cdx"]["url"], default_link_rel=self.default_link_rel)
if url is not None:
urls.append(url)
wayback = "https://web.archive.org/web/{}/{}".format(
- obj['cdx']['datetime'],
- obj['cdx']['url'])
+ obj["cdx"]["datetime"], obj["cdx"]["url"]
+ )
urls.append(("webarchive", wayback))
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
fe = fatcat_openapi_client.FileEntity(
- md5=obj['file_meta']['md5hex'],
- sha1=obj['file_meta']['sha1hex'],
- sha256=obj['file_meta']['sha256hex'],
- size=int(obj['file_meta']['size_bytes']),
- mimetype=obj['file_meta']['mimetype'] or None,
+ md5=obj["file_meta"]["md5hex"],
+ sha1=obj["file_meta"]["sha1hex"],
+ sha256=obj["file_meta"]["sha256hex"],
+ size=int(obj["file_meta"]["size_bytes"]),
+ mimetype=obj["file_meta"]["mimetype"] or None,
release_ids=release_ids,
urls=urls,
extra=dict(shadows=extra),
@@ -130,45 +131,50 @@ class ShadowLibraryImporter(EntityImporter):
if not existing.extra:
existing.extra = {}
- if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
+ if (
+ existing.extra.get("shadows")
+ and list(fe.extra["shadows"].keys())[0] in existing.extra["shadows"]
+ ):
# already imported from this shadow library; skip
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# check for edit conflicts
if existing.ident in [e.ident for e in self._edits_inflight]:
- self.counts['skip-update-inflight'] += 1
+ self.counts["skip-update-inflight"] += 1
return False
if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
raise Exception("Inflight insert; shouldn't happen")
# minimum viable "existing" URL cleanup to fix dupes and broken links:
# remove 'None' wayback URLs, and set archive.org rel 'archive'
- existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+ existing.urls = [
+ u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url)
+ ]
for i in range(len(existing.urls)):
u = existing.urls[i]
- if u.rel == 'repository' and '://archive.org/download/' in u.url:
- existing.urls[i].rel = 'archive'
- if u.rel == 'social':
- u.rel = 'academicsocial'
+ if u.rel == "repository" and "://archive.org/download/" in u.url:
+ existing.urls[i].rel = "archive"
+ if u.rel == "social":
+ u.rel = "academicsocial"
# merge the existing into this one and update
merged_urls = {}
for u in fe.urls + existing.urls:
merged_urls[u.url] = u
existing.urls = list(merged_urls.values())
- if not existing.extra.get('shadows'):
- existing.extra['shadows'] = fe.extra['shadows']
+ if not existing.extra.get("shadows"):
+ existing.extra["shadows"] = fe.extra["shadows"]
else:
- existing.extra['shadows'].update(fe.extra['shadows'])
+ existing.extra["shadows"].update(fe.extra["shadows"])
# do these "plus ones" because we really want to do these updates when possible
if len(existing.urls) > SANE_MAX_URLS + 1:
- self.counts['skip-update-too-many-url'] += 1
+ self.counts["skip-update-too-many-url"] += 1
return None
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
- self.counts['skip-update-too-many-releases'] += 1
+ self.counts["skip-update-too-many-releases"] += 1
return None
existing.mimetype = existing.mimetype or fe.mimetype
existing.size = existing.size or fe.size
@@ -180,12 +186,15 @@ class ShadowLibraryImporter(EntityImporter):
# group-level de-dupe
edit.sha1 = existing.sha1
self._edits_inflight.append(edit)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
def insert_batch(self, batch):
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 196f86ff..22fefad3 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -33,22 +33,23 @@ REQ_SESSION = requests.Session()
def parse_wbm_url(url):
"""Takes a wayback machine URL, and returns a tuple:
- (timestamp, datetime, original_url)
+ (timestamp, datetime, original_url)
"""
- chunks = url.split('/')
+ chunks = url.split("/")
assert len(chunks) >= 6
- assert chunks[2] == 'web.archive.org'
- assert chunks[3] == 'web'
- return (chunks[4],
- parse_wbm_timestamp(chunks[4]),
- '/'.join(chunks[5:]))
+ assert chunks[2] == "web.archive.org"
+ assert chunks[3] == "web"
+ return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
+
def test_parse_wbm_url():
u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
assert parse_wbm_url(u) == (
"20010712114837",
datetime.datetime(2001, 7, 12, 11, 48, 37),
- "http://www.dlib.org/dlib/june01/reich/06reich.html")
+ "http://www.dlib.org/dlib/june01/reich/06reich.html",
+ )
+
def parse_wbm_timestamp(timestamp):
"""
@@ -56,7 +57,7 @@ def parse_wbm_timestamp(timestamp):
python datetime object (UTC)
"""
# strip any "im_" or "id_" suffix
- if timestamp.endswith('_'):
+ if timestamp.endswith("_"):
timestamp = timestamp[:-3]
# inflexible; require the full second-precision timestamp
assert len(timestamp) == 14
@@ -66,11 +67,13 @@ def parse_wbm_timestamp(timestamp):
day=int(timestamp[6:8]),
hour=int(timestamp[8:10]),
minute=int(timestamp[10:12]),
- second=int(timestamp[12:14]))
+ second=int(timestamp[12:14]),
+ )
+
def test_parse_wbm_timestamp():
- assert parse_wbm_timestamp("20010712114837") == \
- datetime.datetime(2001, 7, 12, 11, 48, 37)
+ assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
+
def fetch_wbm(url):
resp = REQ_SESSION.get(url)
@@ -78,31 +81,35 @@ def fetch_wbm(url):
assert resp.content
return resp.content
+
def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
sys.stderr.write(embed_url + "\n")
- assert embed_url.startswith('/web/')
- embed_url = embed_url.split('/')
+ assert embed_url.startswith("/web/")
+ embed_url = embed_url.split("/")
timestamp = embed_url[2]
- if timestamp.endswith('_'):
+ if timestamp.endswith("_"):
timestamp = timestamp[:-3]
- url = '/'.join(embed_url[3:])
- #print((timestamp, url))
- resp = REQ_SESSION.get(CDX_API_BASE, params=dict(
- url=url,
- closest=timestamp,
- sort="closest",
- resolveRevisits="true",
- matchType="exact",
- limit=1,
- ))
+ url = "/".join(embed_url[3:])
+ # print((timestamp, url))
+ resp = REQ_SESSION.get(
+ CDX_API_BASE,
+ params=dict(
+ url=url,
+ closest=timestamp,
+ sort="closest",
+ resolveRevisits="true",
+ matchType="exact",
+ limit=1,
+ ),
+ )
resp.raise_for_status()
- #print(resp.url)
+ # print(resp.url)
if resp.content:
- hit = resp.content.decode('utf-8').split('\n')[0]
+ hit = resp.content.decode("utf-8").split("\n")[0]
if cdx_output:
cdx_output.write(hit + "\n")
- cdx = hit.split(' ')
- cdx = [x if (x and x != '-') else None for x in cdx]
+ cdx = hit.split(" ")
+ cdx = [x if (x and x != "-") else None for x in cdx]
webcapture_cdx = WebcaptureCdxLine(
surt=cdx[0],
timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
@@ -113,9 +120,9 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
sha256=None,
)
if verify_hashes:
- resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format(
- cdx[1], # raw timestamp
- webcapture_cdx.url))
+ resp = REQ_SESSION.get(
+ GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp
+ )
resp.raise_for_status()
assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
@@ -124,47 +131,50 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
else:
return None
+
def wayback_url_to_relative(url):
"""
Wayback URLs can be relative or absolute in rewritten documents. This
function converts any form of rewritten URL to a relative (to
web.archive.org) one, or returns None if it isn't a rewritten URL at all.
"""
- if url.startswith('https://web.archive.org/'):
+ if url.startswith("https://web.archive.org/"):
url = url[23:]
- elif url.startswith('http://web.archive.org/'):
+ elif url.startswith("http://web.archive.org/"):
url = url[22:]
- if url.startswith('/web/'):
+ if url.startswith("/web/"):
return url
else:
return None
+
def extract_embeds(soup):
embeds = set()
# <link href="">
- for tag in soup.find_all('link', href=True):
- if tag['rel'] not in ('stylesheet',):
+ for tag in soup.find_all("link", href=True):
+ if tag["rel"] not in ("stylesheet",):
continue
- url = wayback_url_to_relative(tag['href'])
+ url = wayback_url_to_relative(tag["href"])
if url:
embeds.add(url)
# <img src="">
- for tag in soup.find_all('img', src=True):
- url = wayback_url_to_relative(tag['src'])
+ for tag in soup.find_all("img", src=True):
+ url = wayback_url_to_relative(tag["src"])
if url:
embeds.add(url)
# <script src="">
- for tag in soup.find_all('script', src=True):
- url = wayback_url_to_relative(tag['src'])
+ for tag in soup.find_all("script", src=True):
+ url = wayback_url_to_relative(tag["src"])
if url:
embeds.add(url)
return list(embeds)
+
def static_wayback_webcapture(wayback_url, cdx_output=None):
"""
Given a complete wayback machine capture URL, like:
@@ -177,36 +187,40 @@ def static_wayback_webcapture(wayback_url, cdx_output=None):
wbm_html = fetch_wbm(wayback_url)
raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- #with open(rewritten_path, 'r') as fp:
+ # with open(rewritten_path, 'r') as fp:
# soup = BeautifulSoup(fp, "lxml")
soup = BeautifulSoup(wbm_html, "lxml")
embeds = extract_embeds(soup)
- cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url),
- cdx_output=cdx_output)
+ cdx_obj = lookup_cdx(
+ "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
+ )
cdx_list = [cdx_obj]
for url in embeds:
cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
cdx_list.append(cdx_obj)
- archive_urls = [WebcaptureUrl(
- rel="wayback",
- url="https://web.archive.org/web/",
- )]
+ archive_urls = [
+ WebcaptureUrl(
+ rel="wayback",
+ url="https://web.archive.org/web/",
+ )
+ ]
wc = WebcaptureEntity(
cdx=cdx_list,
timestamp=timestamp.isoformat() + "Z",
original_url=original_url,
archive_urls=archive_urls,
- release_ids=None)
+ release_ids=None,
+ )
return wc
+
def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
"""
Returns a tuple: (editgroup_id, edit). If failed, both are None
"""
raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- git_rev = subprocess.check_output(
- ["git", "describe", "--always"]).strip().decode('utf-8')
+ git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
release = api.get_release(release_id, expand="webcaptures")
@@ -214,37 +228,44 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
for wc in release.webcaptures:
if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
# skipping: already existed
- print("release {} already had webcapture {} {}".format(
- release_id, raw_timestamp, original_url))
+ print(
+ "release {} already had webcapture {} {}".format(
+ release_id, raw_timestamp, original_url
+ )
+ )
return (None, None)
wc = static_wayback_webcapture(wayback_url)
assert len(wc.cdx) >= 1
wc.release_ids = [release_id]
if not editgroup_id:
- eg = api.create_editgroup(Editgroup(
- description="One-off import of static web content from wayback machine",
- extra=dict(
- git_rev=git_rev,
- agent="fatcat_tools.auto_wayback_static")))
+ eg = api.create_editgroup(
+ Editgroup(
+ description="One-off import of static web content from wayback machine",
+ extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
+ )
+ )
editgroup_id = eg.editgroup_id
edit = api.create_webcapture(eg.editgroup_id, wc)
return (editgroup_id, edit)
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--verbose',
- action='store_true',
- help="verbose output")
- parser.add_argument('wayback_url',
- type=str,
- help="URL of wayback capture to extract from")
- parser.add_argument('--json-output',
- type=argparse.FileType('w'), default=sys.stdout,
- help="where to write out webcapture entity (as JSON)")
- parser.add_argument('--cdx-output',
- type=argparse.FileType('w'), default=None,
- help="(optional) file to write out CDX stub")
+ parser.add_argument("--verbose", action="store_true", help="verbose output")
+ parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
+ parser.add_argument(
+ "--json-output",
+ type=argparse.FileType("w"),
+ default=sys.stdout,
+ help="where to write out webcapture entity (as JSON)",
+ )
+ parser.add_argument(
+ "--cdx-output",
+ type=argparse.FileType("w"),
+ default=None,
+ help="(optional) file to write out CDX stub",
+ )
args = parser.parse_args()
@@ -254,5 +275,6 @@ def main():
wc_dict = api_client.sanitize_for_serialization(wc)
print(json.dumps(wc_dict))
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()