author    Bryan Newbold <bnewbold@robocracy.org>  2021-11-02 18:14:59 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2021-11-02 18:14:59 -0700
commit    31d1a6a713d177990609767d508209ced19ca396 (patch)
tree      a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers
parent    9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/importers')
22 files changed, 2578 insertions, 2115 deletions
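This commit applies the black code formatter to `python/fatcat_tools/`; the diff below is mechanical reformatting only (string-quote normalization, consistent spacing, and wrapping of long call sites into multi-line forms), with no behavior changes. As a minimal sketch of that transformation, the snippet below runs black's Python API on one line taken from the first hunk; the black version and `line_length` value are assumptions, not taken from this commit.

```python
# Minimal sketch of the rewrite black performs throughout this diff:
# single quotes become double quotes and assignments get normalized spacing.
# The line_length value is an assumption; the repo's actual setting may differ.
import black

before = "ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL'\n"

after = black.format_str(before, mode=black.FileMode(line_length=96))
print(after)
# ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
```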
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index 2b0ff7ec..ae4f9049 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,9 +1,9 @@ - import fatcat_openapi_client from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url -ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' +ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL" + class ArabesqueMatchImporter(EntityImporter): """ @@ -38,17 +38,17 @@ class ArabesqueMatchImporter(EntityImporter): def __init__(self, api, extid_type, require_grobid=True, **kwargs): - eg_desc = kwargs.get('editgroup_description', None) or "Match web crawl files to releases based on identifier/URL seedlist" - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter') - if kwargs.get('crawl_id'): - eg_extra['crawl_id'] = kwargs.get('crawl_id') - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) - assert extid_type in ('doi', 'pmcid', 'pmid') + eg_desc = ( + kwargs.get("editgroup_description", None) + or "Match web crawl files to releases based on identifier/URL seedlist" + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArabesqueMatchImporter") + if kwargs.get("crawl_id"): + eg_extra["crawl_id"] = kwargs.get("crawl_id") + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) + assert extid_type in ("doi", "pmcid", "pmid") self.extid_type = extid_type self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel @@ -60,33 +60,35 @@ class ArabesqueMatchImporter(EntityImporter): print("NOT checking GROBID status column") def want(self, row): - if self.require_grobid and not row['postproc_status'] == "200": + if self.require_grobid and not row["postproc_status"] == "200": return False - if (bool(row['hit']) is True - and row['final_sha1'] - and row['final_timestamp'] - and row['final_timestamp'] != "-" - and len(row['final_timestamp']) == 14 - and row['final_mimetype'] - and bool(row['hit']) is True - and row['identifier']): + if ( + bool(row["hit"]) is True + and row["final_sha1"] + and row["final_timestamp"] + and row["final_timestamp"] != "-" + and len(row["final_timestamp"]) == 14 + and row["final_mimetype"] + and bool(row["hit"]) is True + and row["identifier"] + ): return True else: return False def parse_record(self, row): - extid = row['identifier'].strip() + extid = row["identifier"].strip() # check/cleanup DOI - if self.extid_type == 'doi': + if self.extid_type == "doi": extid = extid.lower() - extid.replace('http://doi.org/', '') - extid.replace('https://doi.org/', '') - if extid.startswith('doi:'): + extid.replace("http://doi.org/", "") + extid.replace("https://doi.org/", "") + if extid.startswith("doi:"): extid = extid[4:] - if not extid.startswith('10.'): - self.counts['skip-extid-invalid'] + if not extid.startswith("10."): + self.counts["skip-extid-invalid"] return None # lookup extid @@ -95,35 +97,35 @@ class ArabesqueMatchImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status == 404: # bail on 404 (release not in DB) - self.counts['skip-extid-not-found'] += 1 + 
self.counts["skip-extid-not-found"] += 1 return None elif err.status == 400: - self.counts['skip-extid-invalid'] += 1 + self.counts["skip-extid-invalid"] += 1 return None else: raise err - url = make_rel_url(row['final_url'], self.default_link_rel) + url = make_rel_url(row["final_url"], self.default_link_rel) if not url: - self.counts['skip-url'] += 1 + self.counts["skip-url"] += 1 return None - if not row['final_timestamp']: - self.counts['skip-missing-timestamp'] += 1 + if not row["final_timestamp"]: + self.counts["skip-missing-timestamp"] += 1 return None wayback = "https://web.archive.org/web/{}/{}".format( - row['final_timestamp'], - row['final_url']) + row["final_timestamp"], row["final_url"] + ) urls = [url, ("webarchive", wayback)] urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] if len(urls) > SANE_MAX_URLS: - self.counts['skip-too-many-url'] += 1 + self.counts["skip-too-many-url"] += 1 return None fe = fatcat_openapi_client.FileEntity( - sha1=b32_hex(row['final_sha1']), - mimetype=row['final_mimetype'] or self.default_mimetype, + sha1=b32_hex(row["final_sha1"]), + mimetype=row["final_mimetype"] or self.default_mimetype, release_ids=[re.ident], urls=urls, ) @@ -143,15 +145,15 @@ class ArabesqueMatchImporter(EntityImporter): if (fe.release_ids[0] in existing.release_ids) and existing.urls: # TODO: could still, in theory update with the new URL? - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not self.do_updates: - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False if existing.ident in [e.ident for e in self._edits_inflight]: - self.counts['skip-update-inflight'] += 1 + self.counts["skip-update-inflight"] += 1 return False # TODO: this code path never gets hit because of the check above @@ -159,28 +161,33 @@ class ArabesqueMatchImporter(EntityImporter): existing_urls = set([u.url for u in existing.urls]) new_urls = set([u.url for u in fe.urls]) if existing_urls.issuperset(new_urls): - self.counts['skip-update-nothing-new'] += 1 + self.counts["skip-update-nothing-new"] += 1 return False # merge the existing into this one and update existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) - existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + existing.urls = [ + fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls + ] if len(existing.urls) > SANE_MAX_URLS: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) if len(existing.release_ids) > SANE_MAX_RELEASES: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.mimetype = existing.mimetype or fe.mimetype edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) self._edits_inflight.append(edit) - self.counts['update'] += 1 + self.counts["update"] += 1 return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git 
a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index fc429fb0..7a689ed2 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -1,4 +1,3 @@ - import datetime import json import re @@ -13,6 +12,7 @@ from .crossref import lookup_license_slug latex2text = LatexNodes2Text() + def latex_to_text(raw): try: return latex2text.latex_to_text(raw).strip() @@ -21,13 +21,14 @@ def latex_to_text(raw): except IndexError: return raw.strip() + def parse_arxiv_authors(raw): if not raw: return [] - raw = raw.replace('*', '') - if '(' in raw: - raw = re.sub(r'\(.*\)', '', raw) - authors = raw.split(', ') + raw = raw.replace("*", "") + if "(" in raw: + raw = re.sub(r"\(.*\)", "", raw) + authors = raw.split(", ") if authors: last = authors[-1].split(" and ") if len(last) == 2: @@ -39,9 +40,12 @@ def parse_arxiv_authors(raw): authors = [a for a in authors if a] return authors + def test_parse_arxiv_authors(): - assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [ + assert parse_arxiv_authors( + "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an" + ) == [ "Raphael Chetrite", "Shamik Gupta", "Izaak Neri", @@ -63,7 +67,9 @@ def test_parse_arxiv_authors(): "Raphael Chetrite Shamik Gupta", ] - assert parse_arxiv_authors("B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)") == [ + assert parse_arxiv_authors( + "B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)" + ) == [ "B. P. Lanyon", "T. J. Weinhold", "N. K. Langford", @@ -84,17 +90,21 @@ class ArxivRawImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of arxiv metadata via arXivRaw OAI-PMH feed") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter') + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of arxiv metadata via arXivRaw OAI-PMH feed", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArxivRawImporter") # lower batch size, because multiple versions per entry (guessing 2-3 on average?) 
- batch_size = kwargs.get('edit_batch_size', 50) - super().__init__(api, + batch_size = kwargs.get("edit_batch_size", 50) + super().__init__( + api, editgroup_description=eg_desc, editgroup_extra=eg_extra, batch_size=batch_size, - **kwargs) + **kwargs + ) self._test_override = False def parse_record(self, record): @@ -114,53 +124,56 @@ class ArxivRawImporter(EntityImporter): doi = None if metadata.doi and metadata.doi.string: doi = metadata.doi.string.lower().split()[0].strip() - if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]): + if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): sys.stderr.write("BOGUS DOI: {}\n".format(doi)) doi = None - title = latex_to_text(metadata.title.get_text().replace('\n', ' ')) - authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' ')) - contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)] - - lang = "en" # the vast majority in english + title = latex_to_text(metadata.title.get_text().replace("\n", " ")) + authors = parse_arxiv_authors(metadata.authors.get_text().replace("\n", " ")) + contribs = [ + fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role="author") + for i, a in enumerate(authors) + ] + + lang = "en" # the vast majority in english if metadata.comments and metadata.comments.get_text(): - comments = metadata.comments.get_text().replace('\n', ' ').strip() - extra_arxiv['comments'] = comments - if 'in french' in comments.lower(): - lang = 'fr' - elif 'in spanish' in comments.lower(): - lang = 'es' - elif 'in portuguese' in comments.lower(): - lang = 'pt' - elif 'in hindi' in comments.lower(): - lang = 'hi' - elif 'in japanese' in comments.lower(): - lang = 'ja' - elif 'in german' in comments.lower(): - lang = 'de' - elif 'simplified chinese' in comments.lower(): - lang = 'zh' - elif 'in russian' in comments.lower(): - lang = 'ru' + comments = metadata.comments.get_text().replace("\n", " ").strip() + extra_arxiv["comments"] = comments + if "in french" in comments.lower(): + lang = "fr" + elif "in spanish" in comments.lower(): + lang = "es" + elif "in portuguese" in comments.lower(): + lang = "pt" + elif "in hindi" in comments.lower(): + lang = "hi" + elif "in japanese" in comments.lower(): + lang = "ja" + elif "in german" in comments.lower(): + lang = "de" + elif "simplified chinese" in comments.lower(): + lang = "zh" + elif "in russian" in comments.lower(): + lang = "ru" # more languages? number = None - if metadata.find('journal-ref') and metadata.find('journal-ref').get_text(): - journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip() - extra_arxiv['journal_ref'] = journal_ref + if metadata.find("journal-ref") and metadata.find("journal-ref").get_text(): + journal_ref = metadata.find("journal-ref").get_text().replace("\n", " ").strip() + extra_arxiv["journal_ref"] = journal_ref if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(): release_type = "paper-conference" - if metadata.find('report-no') and metadata.find('report-no').string: - number = metadata.find('report-no').string.strip() + if metadata.find("report-no") and metadata.find("report-no").string: + number = metadata.find("report-no").string.strip() # at least some people plop extra metadata in here. hrmf! 
- if 'ISSN ' in number or 'ISBN ' in number or len(number.split()) > 2: - extra_arxiv['report-no'] = number + if "ISSN " in number or "ISBN " in number or len(number.split()) > 2: + extra_arxiv["report-no"] = number number = None else: release_type = "report" - if metadata.find('acm-class') and metadata.find('acm-class').string: - extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip() + if metadata.find("acm-class") and metadata.find("acm-class").string: + extra_arxiv["acm_class"] = metadata.find("acm-class").string.strip() if metadata.categories and metadata.categories.get_text(): - extra_arxiv['categories'] = metadata.categories.get_text().split() + extra_arxiv["categories"] = metadata.categories.get_text().split() license_slug = None if metadata.license and metadata.license.get_text(): license_slug = lookup_license_slug(metadata.license.get_text()) @@ -170,21 +183,29 @@ class ArxivRawImporter(EntityImporter): abstracts = [] abst = metadata.abstract.get_text().strip() orig = None - if '-----' in abst: - both = abst.split('-----') + if "-----" in abst: + both = abst.split("-----") abst = both[0].strip() orig = both[1].strip() - if '$' in abst or '{' in abst: + if "$" in abst or "{" in abst: mime = "application/x-latex" abst_plain = latex_to_text(abst) - abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en")) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + content=abst_plain, mimetype="text/plain", lang="en" + ) + ) else: mime = "text/plain" - abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en")) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en") + ) if orig: - abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime)) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime) + ) # indicates that fulltext probably isn't english either - if lang == 'en': + if lang == "en": lang = None # extra: @@ -195,39 +216,43 @@ class ArxivRawImporter(EntityImporter): # container_name # group-title # arxiv: comments, categories, etc - extra_arxiv['base_id'] = base_id - extra['superceded'] = True - extra['arxiv'] = extra_arxiv + extra_arxiv["base_id"] = base_id + extra["superceded"] = True + extra["arxiv"] = extra_arxiv versions = [] - for version in metadata.find_all('version'): - arxiv_id = base_id + version['version'] + for version in metadata.find_all("version"): + arxiv_id = base_id + version["version"] release_date = version.date.string.strip() - release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date() + release_date = datetime.datetime.strptime( + release_date, "%a, %d %b %Y %H:%M:%S %Z" + ).date() # TODO: source_type? 
- versions.append(fatcat_openapi_client.ReleaseEntity( - work_id=None, - title=title, - #original_title - version=version['version'], - release_type=release_type, - release_stage='submitted', - release_date=release_date.isoformat(), - release_year=release_date.year, - ext_ids=fatcat_openapi_client.ReleaseExtIds( - arxiv=arxiv_id, - ), - number=number, - language=lang, - license_slug=license_slug, - abstracts=abstracts, - contribs=contribs, - extra=extra.copy(), - )) + versions.append( + fatcat_openapi_client.ReleaseEntity( + work_id=None, + title=title, + # original_title + version=version["version"], + release_type=release_type, + release_stage="submitted", + release_date=release_date.isoformat(), + release_year=release_date.year, + ext_ids=fatcat_openapi_client.ReleaseExtIds( + arxiv=arxiv_id, + ), + number=number, + language=lang, + license_slug=license_slug, + abstracts=abstracts, + contribs=contribs, + extra=extra.copy(), + ) + ) # TODO: assert that versions are actually in order? assert versions - versions[-1].extra.pop('superceded') + versions[-1].extra.pop("superceded") # only apply DOI to most recent version (HACK) if doi: @@ -306,7 +331,7 @@ class ArxivRawImporter(EntityImporter): for v in versions: if v._existing_work_id: if not v._updated: - self.counts['exists'] += 1 + self.counts["exists"] += 1 continue if not any_work_id and last_edit: # fetch the last inserted release from this group @@ -315,7 +340,7 @@ class ArxivRawImporter(EntityImporter): any_work_id = r.work_id v.work_id = any_work_id last_edit = self.api.create_release(self.get_editgroup_id(), v) - self.counts['insert'] += 1 + self.counts["insert"] += 1 return False @@ -323,12 +348,15 @@ class ArxivRawImporter(EntityImporter): # there is no batch/bezerk mode for arxiv importer, except for testing if self._test_override: for batch in batch_batch: - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) - self.counts['insert'] += len(batch) - 1 + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) + self.counts["insert"] += len(batch) - 1 else: raise NotImplementedError() @@ -341,9 +369,9 @@ class ArxivRawImporter(EntityImporter): for article in soup.find_all("record"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) -if __name__ == '__main__': +if __name__ == "__main__": parser = ArxivRawImporter(None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index 0340f6a3..e9de42fc 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -34,15 +34,15 @@ def single_file(prefix, path): hashlib.sha1(), hashlib.sha256(), ] - with open(full, 'rb') as fp: + with open(full, "rb") as fp: while True: - data = fp.read(2**20) + data = fp.read(2 ** 20) if not data: break for h in hashes: h.update(data) mime = magic.Magic(mime=True).from_file(full) - if mime == 'application/octet-stream': + if mime == "application/octet-stream": # magic apparently isn't that great; try using filename as well guess = mimetypes.guess_type(full)[0] if guess: @@ -54,9 +54,11 @@ def single_file(prefix, path): md5=hashes[0].hexdigest(), 
sha1=hashes[1].hexdigest(), sha256=hashes[2].hexdigest(), - extra=dict(mimetype=mime)) + extra=dict(mimetype=mime), + ) return fsf + def make_manifest(base_dir): manifest = [] for root, dirs, files in os.walk(base_dir): @@ -70,47 +72,49 @@ def cdl_dash_release(meta, extra=None): if not extra: extra = dict() - assert meta['identifier']['type'] == 'DOI' - doi = meta['identifier']['value'].lower() - assert doi.startswith('10.') + assert meta["identifier"]["type"] == "DOI" + doi = meta["identifier"]["value"].lower() + assert doi.startswith("10.") ark_id = None - for extid in meta.get('alternativeIdentifiers', []): - if extid['value'].startswith('ark:'): - ark_id = extid['value'] + for extid in meta.get("alternativeIdentifiers", []): + if extid["value"].startswith("ark:"): + ark_id = extid["value"] assert ark_id - license_slug = lookup_license_slug(meta['rights']['uri']) + license_slug = lookup_license_slug(meta["rights"]["uri"]) abstracts = [] - for desc in meta['descriptions']: - if desc['type'] == "abstract": - abstracts.append(ReleaseAbstract( - mimetype="text/html", - content=clean(desc['value']))) - #print(abstracts) + for desc in meta["descriptions"]: + if desc["type"] == "abstract": + abstracts.append( + ReleaseAbstract(mimetype="text/html", content=clean(desc["value"])) + ) + # print(abstracts) if not abstracts: abstracts = None contribs = [] - for creator in meta['creator']: - contribs.append(ReleaseContrib( - given_name=creator['given'], - surname=creator['family'], - # sorry everybody - raw_name="{} {}".format(creator['given'], creator['family']), - raw_affiliation=creator.get('affiliation'), - role="author", # presumably, for these datasets? - )) + for creator in meta["creator"]: + contribs.append( + ReleaseContrib( + given_name=creator["given"], + surname=creator["family"], + # sorry everybody + raw_name="{} {}".format(creator["given"], creator["family"]), + raw_affiliation=creator.get("affiliation"), + role="author", # presumably, for these datasets? 
+ ) + ) r = ReleaseEntity( ext_ids=ReleaseExtIds( doi=doi, ark=ark_id, ), - title=clean(meta['title'], force_xml=True), - publisher=clean(meta['publisher']), - release_year=int(meta['publicationYear']), + title=clean(meta["title"], force_xml=True), + publisher=clean(meta["publisher"]), + release_year=int(meta["publicationYear"]), release_type="dataset", license_slug=license_slug, contribs=contribs, @@ -119,66 +123,66 @@ def cdl_dash_release(meta, extra=None): ) return r + def make_release_fileset(dat_path): - if dat_path.endswith('/'): + if dat_path.endswith("/"): dat_path = dat_path[:-1] dat_discovery = dat_path extra = dict() assert len(dat_discovery) == 64 - with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp: + with open(dat_path + "/cdl_dash_metadata.json", "r") as fp: meta_dict = json.loads(fp.read()) release = cdl_dash_release(meta_dict) - ark_id = release.extra['ark_id'] + ark_id = release.extra["ark_id"] dash_version = None # really crude XML parse-out - with open(dat_path + "/stash-wrapper.xml", 'r') as fp: + with open(dat_path + "/stash-wrapper.xml", "r") as fp: for line in fp: line = line.strip() if line.startswith("<st:version_number>"): - dash_version = int(line[19:].split('<')[0]) + dash_version = int(line[19:].split("<")[0]) assert dash_version is not None - extra['cdl_dash'] = dict(version=dash_version) - release.extra['cdl_dash'] = dict(version=dash_version) + extra["cdl_dash"] = dict(version=dash_version) + release.extra["cdl_dash"] = dict(version=dash_version) manifest = make_manifest(dat_path + "/files/") bundle_url = dict( url="https://merritt.cdlib.org/u/{}/{}".format( - urllib.parse.quote(ark_id, safe=''), - dash_version), - rel="repo-bundle") + urllib.parse.quote(ark_id, safe=""), dash_version + ), + rel="repo-bundle", + ) repo_url = dict( url="https://merritt.cdlib.org/d/{}/{}/".format( - urllib.parse.quote(ark_id, safe=''), - dash_version), - rel="repo") - dat_url = dict( - url="dat://{}/files/".format(dat_discovery), - rel="dweb") + urllib.parse.quote(ark_id, safe=""), dash_version + ), + rel="repo", + ) + dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb") fs = FilesetEntity( - urls=[bundle_url, repo_url, dat_url], - release_ids=None, - manifest=manifest, - extra=extra) + urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra + ) return (release, fs) + def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None): - git_rev = subprocess.check_output( - ["git", "describe", "--always"]).strip().decode('utf-8') + git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") (release, fileset) = make_release_fileset(dat_path) if not editgroup_id: - eg = api.create_editgroup(Editgroup( - description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", - extra=dict( - git_rev=git_rev, - agent="fatcat_tools.auto_cdl_dash_dat"))) + eg = api.create_editgroup( + Editgroup( + description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", + extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"), + ) + ) editgroup_id = eg.editgroup_id if not release_id and release.ext_ids.doi: @@ -201,6 +205,7 @@ def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None): fileset = api.get_fileset(edit.ident) return (editgroup_id, release, fileset) -if __name__=='__main__': + +if __name__ == "__main__": # pass this a discovery key that has been cloned to the local directory 
print(make_release_fileset(sys.argv[1])) diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 0b634e73..8d2a89b6 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from .common import EntityImporter, clean @@ -15,20 +14,19 @@ class ChoculaImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata from Chocula tool.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ChoculaImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of container-level metadata from Chocula tool.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): - if not raw_record.get('ident') and not raw_record.get('_known_issnl'): - self.counts['skip-unknown-new-issnl'] += 1 + if not raw_record.get("ident") and not raw_record.get("_known_issnl"): + self.counts["skip-unknown-new-issnl"] += 1 return False - if raw_record.get('issnl') and raw_record.get('name'): + if raw_record.get("issnl") and raw_record.get("name"): return True return False @@ -39,42 +37,55 @@ class ChoculaImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - name = clean(row.get('name')) + name = clean(row.get("name")) if not name: # Name is required (by schema) return None name = name.strip() - if name.endswith(', Proceedings of the'): - name = "Proceedings of the " + name.split(',')[0] + if name.endswith(", Proceedings of the"): + name = "Proceedings of the " + name.split(",")[0] - if name.endswith('.'): + if name.endswith("."): name = name[:-1] extra = dict() - for k in ('urls', 'webarchive_urls', 'country', - 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages', - 'ia', 'scielo', 'kbart', 'publisher_type', 'platform'): - if row['extra'].get(k): - extra[k] = row['extra'][k] + for k in ( + "urls", + "webarchive_urls", + "country", + "sherpa_romeo", + "ezb", + "szczepanski", + "doaj", + "languages", + "ia", + "scielo", + "kbart", + "publisher_type", + "platform", + ): + if row["extra"].get(k): + extra[k] = row["extra"][k] container_type = None - if 'proceedings' in name.lower(): - container_type = 'proceedings' - elif 'journal ' in name.lower(): - container_type = 'journal' + if "proceedings" in name.lower(): + container_type = "proceedings" + elif "journal " in name.lower(): + container_type = "journal" ce = fatcat_openapi_client.ContainerEntity( - issnl=row['issnl'], - issnp=row['extra'].get('issnp'), - issne=row['extra'].get('issne'), - ident=row['ident'], + issnl=row["issnl"], + issnp=row["extra"].get("issnp"), + issne=row["extra"].get("issne"), + ident=row["ident"], name=name, container_type=container_type, - publisher=clean(row.get('publisher')), - wikidata_qid=row.get('wikidata_qid'), - extra=extra) + publisher=clean(row.get("publisher")), + wikidata_qid=row.get("wikidata_qid"), + extra=extra, + ) return ce def try_update(self, ce): @@ -86,12 +97,12 @@ class ChoculaImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err - 
self.counts['exists'] += 1 - self.counts['exists-not-found'] += 1 + self.counts["exists"] += 1 + self.counts["exists-not-found"] += 1 return False - if existing.state != 'active': - self.counts['exists'] += 1 - self.counts['exists-inactive'] += 1 + if existing.state != "active": + self.counts["exists"] += 1 + self.counts["exists-inactive"] += 1 return False if not existing: @@ -102,8 +113,8 @@ class ChoculaImporter(EntityImporter): if err.status != 404: raise err if existing: - self.counts['exists'] += 1 - self.counts['exists-by-issnl'] += 1 + self.counts["exists"] += 1 + self.counts["exists-by-issnl"] += 1 return False # doesn't exist, always create return True @@ -111,18 +122,22 @@ class ChoculaImporter(EntityImporter): # decide whether to update do_update = False if not self.do_updates: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not existing.extra: existing.extra = dict() - if ce.extra.get('urls') and set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): + if ce.extra.get("urls") and set(ce.extra.get("urls", [])) != set( + existing.extra.get("urls", []) + ): do_update = True - if ce.extra.get('webarchive_urls') and set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])): + if ce.extra.get("webarchive_urls") and set(ce.extra.get("webarchive_urls", [])) != set( + existing.extra.get("webarchive_urls", []) + ): do_update = True - for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'): + for k in ("ezb", "szczepanski", "publisher_type", "platform"): if ce.extra.get(k) and not existing.extra.get(k): do_update = True - for k in ('kbart', 'ia', 'doaj'): + for k in ("kbart", "ia", "doaj"): # always update these fields if not equal (chocula override) if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k): do_update = True @@ -137,41 +152,53 @@ class ChoculaImporter(EntityImporter): existing.container_type = existing.container_type or ce.container_type existing.issne = existing.issne or ce.issne existing.issnp = existing.issnp or ce.issnp - for k in ('urls', 'webarchive_urls'): + for k in ("urls", "webarchive_urls"): # be conservative about URL updates; don't clobber existing URL lists # may want to make this behavior more sophisticated in the # future, or at least a config flag if ce.extra.get(k) and not existing.extra.get(k): existing.extra[k] = ce.extra.get(k, []) - for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia', - 'scielo', 'kbart', 'publisher_type', 'platform'): + for k in ( + "sherpa_romeo", + "ezb", + "szczepanski", + "doaj", + "ia", + "scielo", + "kbart", + "publisher_type", + "platform", + ): # always update (chocula over-rides) if ce.extra.get(k): existing.extra[k] = ce.extra[k] - for k in ('country',): + for k in ("country",): # only include if not set (don't clobber human edits) if ce.extra.get(k) and not existing.extra.get(k): existing.extra[k] = ce.extra[k] - if ce.extra.get('languages'): - if not existing.extra.get('languages'): - existing.extra['languages'] = ce.extra['languages'] - elif not ce.extra['languages'][0] in existing.extra['languages']: - existing.extra['languages'].append(ce.extra['languages'][0]) + if ce.extra.get("languages"): + if not existing.extra.get("languages"): + existing.extra["languages"] = ce.extra["languages"] + elif not ce.extra["languages"][0] in existing.extra["languages"]: + existing.extra["languages"].append(ce.extra["languages"][0]) self.api.update_container(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + 
self.counts["update"] += 1 return False else: - self.counts['exists'] += 1 - self.counts['exists-skip-update'] += 1 + self.counts["exists"] += 1 + self.counts["exists-skip-update"] += 1 return False # if we got this far, it's a bug raise NotImplementedError def insert_batch(self, batch): - self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_container_auto_batch( + fatcat_openapi_client.ContainerAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index e33a2012..2639c85a 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -1,4 +1,3 @@ - import csv import datetime import json @@ -34,7 +33,6 @@ SANE_MAX_URLS: int = 100 DOMAIN_REL_MAP: Dict[str, str] = { "archive.org": "archive", # LOCKSS, Portico, DuraSpace, etc would also be "archive" - "arxiv.org": "repository", "babel.hathitrust.org": "repository", "cds.cern.ch": "repository", @@ -53,7 +51,6 @@ DOMAIN_REL_MAP: Dict[str, str] = { "zenodo.org": "repository", "www.biorxiv.org": "repository", "www.medrxiv.org": "repository", - "citeseerx.ist.psu.edu": "aggregator", "publisher-connector.core.ac.uk": "aggregator", "core.ac.uk": "aggregator", @@ -62,7 +59,6 @@ DOMAIN_REL_MAP: Dict[str, str] = { "pdfs.semanticscholar.org": "aggregator", "semanticscholar.org": "aggregator", "www.semanticscholar.org": "aggregator", - "academic.oup.com": "publisher", "cdn.elifesciences.org": "publisher", "cell.com": "publisher", @@ -86,15 +82,14 @@ DOMAIN_REL_MAP: Dict[str, str] = { "ehp.niehs.nih.gov": "publisher", "journals.tsu.ru": "publisher", "www.cogentoa.com": "publisher", - "www.researchgate.net": "academicsocial", "academia.edu": "academicsocial", - "wayback.archive-it.org": "webarchive", "web.archive.org": "webarchive", "archive.is": "webarchive", } + def make_rel_url(raw_url: str, default_link_rel: str = "web"): # this is where we map specific domains to rel types, and also filter out # bad domains, invalid URLs, etc @@ -105,12 +100,17 @@ def make_rel_url(raw_url: str, default_link_rel: str = "web"): break return (rel, raw_url) + def test_make_rel_url(): assert make_rel_url("http://example.com/thing.pdf")[0] == "web" assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans" - assert make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] == "webarchive" + assert ( + make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] + == "webarchive" + ) assert make_rel_url("http://cell.com/thing.pdf")[0] == "publisher" + class EntityImporter: """ Base class for fatcat entity importers. 
@@ -147,23 +147,26 @@ class EntityImporter: def __init__(self, api, **kwargs): - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['git_rev'] = eg_extra.get('git_rev', - subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["git_rev"] = eg_extra.get( + "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip() + ).decode("utf-8") + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityImporter") self.api = api - self.do_updates = bool(kwargs.get('do_updates', True)) - self.do_fuzzy_match: bool = kwargs.get('do_fuzzy_match', True) - self.bezerk_mode: bool = kwargs.get('bezerk_mode', False) - self.submit_mode: bool = kwargs.get('submit_mode', False) - self.edit_batch_size: int = kwargs.get('edit_batch_size', 100) - self.editgroup_description: Optional[str] = kwargs.get('editgroup_description') + self.do_updates = bool(kwargs.get("do_updates", True)) + self.do_fuzzy_match: bool = kwargs.get("do_fuzzy_match", True) + self.bezerk_mode: bool = kwargs.get("bezerk_mode", False) + self.submit_mode: bool = kwargs.get("submit_mode", False) + self.edit_batch_size: int = kwargs.get("edit_batch_size", 100) + self.editgroup_description: Optional[str] = kwargs.get("editgroup_description") self.editgroup_extra: Optional[Any] = eg_extra - self.es_client = kwargs.get('es_client') + self.es_client = kwargs.get("es_client") if not self.es_client: - self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120) + self.es_client = elasticsearch.Elasticsearch( + "https://search.fatcat.wiki", timeout=120 + ) self._issnl_id_map: Dict[str, Any] = dict() self._orcid_id_map: Dict[str, Any] = dict() @@ -174,7 +177,7 @@ class EntityImporter: self.reset() def reset(self) -> None: - self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + self.counts = Counter({"total": 0, "skip": 0, "insert": 0, "update": 0, "exists": 0}) self._edit_count: int = 0 self._editgroup_id: Optional[str] = None self._entity_queue: List[Any] = [] @@ -184,13 +187,13 @@ class EntityImporter: """ Returns nothing. 
""" - self.counts['total'] += 1 + self.counts["total"] += 1 if (not raw_record) or (not self.want(raw_record)): - self.counts['skip'] += 1 + self.counts["skip"] += 1 return entity = self.parse_record(raw_record) if not entity: - self.counts['skip'] += 1 + self.counts["skip"] += 1 return if self.bezerk_mode: self.push_entity(entity) @@ -230,7 +233,7 @@ class EntityImporter: if self._entity_queue: self.insert_batch(self._entity_queue) - self.counts['insert'] += len(self._entity_queue) + self.counts["insert"] += len(self._entity_queue) self._entity_queue = [] return self.counts @@ -248,8 +251,9 @@ class EntityImporter: if not self._editgroup_id: eg = self.api.create_editgroup( fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) self._editgroup_id = eg.editgroup_id self._edit_count += edits @@ -257,30 +261,30 @@ class EntityImporter: def create_container(self, entity): eg_id = self.get_editgroup_id() - self.counts['inserted.container'] += 1 + self.counts["inserted.container"] += 1 return self.api.create_container(eg_id, entity) def create_release(self, entity): eg_id = self.get_editgroup_id() - self.counts['inserted.release'] += 1 + self.counts["inserted.release"] += 1 return self.api.create_release(eg_id, entity) def create_file(self, entity): eg_id = self.get_editgroup_id() - self.counts['inserted.file'] += 1 + self.counts["inserted.file"] += 1 return self.api.create_file(eg_id, entity) def updated(self): """ Implementations should call this from try_update() if the update was successful """ - self.counts['update'] += 1 + self.counts["update"] += 1 def push_entity(self, entity): self._entity_queue.append(entity) if len(self._entity_queue) >= self.edit_batch_size: self.insert_batch(self._entity_queue) - self.counts['insert'] += len(self._entity_queue) + self.counts["insert"] += len(self._entity_queue) self._entity_queue = [] def want(self, raw_record: Any) -> bool: @@ -324,7 +328,7 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._orcid_id_map[orcid] = creator_id # might be None + self._orcid_id_map[orcid] = creator_id # might be None return creator_id def is_doi(self, doi: str) -> bool: @@ -347,7 +351,7 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._doi_id_map[doi] = release_id # might be None + self._doi_id_map[doi] = release_id # might be None return release_id def lookup_pmid(self, pmid: str): @@ -364,11 +368,11 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._pmid_id_map[pmid] = release_id # might be None + self._pmid_id_map[pmid] = release_id # might be None return release_id def is_issnl(self, issnl: str) -> bool: - return len(issnl) == 9 and issnl[4] == '-' + return len(issnl) == 9 and issnl[4] == "-" def lookup_issnl(self, issnl: str): """Caches calls to the ISSN-L lookup API endpoint in a local dict""" @@ -382,7 +386,7 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._issnl_id_map[issnl] = container_id # might be None + self._issnl_id_map[issnl] = container_id # might be None return container_id def read_issn_map_file(self, issn_map_file): @@ -417,26 +421,26 @@ class EntityImporter: # update old/deprecated 'rel' on URLs for i in 
range(len(existing.urls)): u = existing.urls[i] - if u.rel == 'repository' and '://archive.org/download/' in u.url: - existing.urls[i].rel = 'archive' - if u.rel == 'social': - u.rel = 'academicsocial' + if u.rel == "repository" and "://archive.org/download/" in u.url: + existing.urls[i].rel = "archive" + if u.rel == "social": + u.rel = "academicsocial" # remove URLs which are near-duplicates redundant_urls = [] all_urls = [u.url for u in existing.urls] - all_wayback_urls = [u.url for u in existing.urls if '://web.archive.org/web/' in u.url] + all_wayback_urls = [u.url for u in existing.urls if "://web.archive.org/web/" in u.url] for url in all_urls: # https/http redundancy - if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls: + if url.startswith("http://") and url.replace("http://", "https://", 1) in all_urls: redundant_urls.append(url) continue # default HTTP port included and not included - if ':80/' in url and url.replace(':80', '', 1) in all_urls: + if ":80/" in url and url.replace(":80", "", 1) in all_urls: redundant_urls.append(url) continue # partial and complete wayback timestamps - if '://web.archive.org/web/2017/' in url: + if "://web.archive.org/web/2017/" in url: original_url = "/".join(url.split("/")[5:]) assert len(original_url) > 5 for wb_url in all_wayback_urls: @@ -452,7 +456,9 @@ class EntityImporter: def generic_fileset_cleanups(existing): return existing - def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]: + def match_existing_release_fuzzy( + self, release: ReleaseEntity + ) -> Optional[Tuple[str, str, ReleaseEntity]]: """ This helper function uses fuzzycat (and elasticsearch) to look for existing release entities with similar metadata. @@ -488,7 +494,15 @@ class EntityImporter: return None release_dict = entity_to_dict(release, api_client=self.api.api_client) - verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates] + verified = [ + ( + fuzzycat.verify.verify( + release_dict, entity_to_dict(c, api_client=self.api.api_client) + ), + c, + ) + for c in candidates + ] # chose the "closest" match closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0] @@ -522,7 +536,6 @@ class RecordPusher: class JsonLinePusher(RecordPusher): - def __init__(self, importer, json_file, **kwargs): self.importer = importer self.json_file = json_file @@ -539,10 +552,9 @@ class JsonLinePusher(RecordPusher): class CsvPusher(RecordPusher): - def __init__(self, importer, csv_file, **kwargs): self.importer = importer - self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) + self.reader = csv.DictReader(csv_file, delimiter=kwargs.get("delimiter", ",")) def run(self): for line in self.reader: @@ -555,7 +567,6 @@ class CsvPusher(RecordPusher): class LinePusher(RecordPusher): - def __init__(self, importer, text_file, **kwargs): self.importer = importer self.text_file = text_file @@ -571,17 +582,15 @@ class LinePusher(RecordPusher): class SqlitePusher(RecordPusher): - def __init__(self, importer, db_file, table_name, where_clause="", **kwargs): self.importer = importer - self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE') + self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE") self.db.row_factory = sqlite3.Row self.table_name = table_name self.where_clause = where_clause def run(self): - cur = self.db.execute("SELECT * FROM {} {};".format( - self.table_name, self.where_clause)) + cur = 
self.db.execute("SELECT * FROM {} {};".format(self.table_name, self.where_clause)) for row in cur: self.importer.push_record(row) counts = self.importer.finish() @@ -590,7 +599,6 @@ class SqlitePusher(RecordPusher): class Bs4XmlLinesPusher(RecordPusher): - def __init__(self, importer, xml_file, prefix_filter=None, **kwargs): self.importer = importer self.xml_file = xml_file @@ -611,7 +619,6 @@ class Bs4XmlLinesPusher(RecordPusher): class Bs4XmlFilePusher(RecordPusher): - def __init__(self, importer, xml_file, record_tag, **kwargs): self.importer = importer self.xml_file = xml_file @@ -684,7 +691,6 @@ class Bs4XmlLargeFilePusher(RecordPusher): class Bs4XmlFileListPusher(RecordPusher): - def __init__(self, importer, list_file, record_tag, **kwargs): self.importer = importer self.list_file = list_file @@ -695,7 +701,7 @@ class Bs4XmlFileListPusher(RecordPusher): xml_path = xml_path.strip() if not xml_path or xml_path.startswith("#"): continue - with open(xml_path, 'r') as xml_file: + with open(xml_path, "r") as xml_file: soup = BeautifulSoup(xml_file, "xml") for record in soup.find_all(self.record_tag): self.importer.push_record(record) @@ -705,10 +711,12 @@ class Bs4XmlFileListPusher(RecordPusher): print(counts) return counts + class KafkaBs4XmlPusher(RecordPusher): """ Fetch XML for an article from Kafka, parse via Bs4. """ + def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): self.importer = importer self.consumer = make_kafka_consumer( @@ -716,10 +724,10 @@ class KafkaBs4XmlPusher(RecordPusher): kafka_env, topic_suffix, group, - kafka_namespace=kwargs.get('kafka_namespace', 'fatcat') + kafka_namespace=kwargs.get("kafka_namespace", "fatcat"), ) - self.poll_interval = kwargs.get('poll_interval', 5.0) - self.consume_batch_size = kwargs.get('consume_batch_size', 25) + self.poll_interval = kwargs.get("poll_interval", 5.0) + self.consume_batch_size = kwargs.get("consume_batch_size", 25) def run(self): count = 0 @@ -735,16 +743,19 @@ class KafkaBs4XmlPusher(RecordPusher): # outstanding editgroups every 5 minutes, but there is still that # window when editgroups might be hanging (unsubmitted). batch = self.consumer.consume( - num_messages=self.consume_batch_size, - timeout=self.poll_interval) - print("... got {} kafka messages ({}sec poll interval) {}".format( - len(batch), self.poll_interval, self.importer.counts)) + num_messages=self.consume_batch_size, timeout=self.poll_interval + ) + print( + "... got {} kafka messages ({}sec poll interval) {}".format( + len(batch), self.poll_interval, self.importer.counts + ) + ) if not batch: if datetime.datetime.now() - last_push > datetime.timedelta(minutes=5): # it has been some time, so flush any current editgroup self.importer.finish() last_push = datetime.datetime.now() - #print("Flushed any partial import batch: {}".format(self.importer.counts)) + # print("Flushed any partial import batch: {}".format(self.importer.counts)) continue # first check errors on entire batch... for msg in batch: @@ -752,7 +763,7 @@ class KafkaBs4XmlPusher(RecordPusher): raise KafkaException(msg.error()) # ... 
then process for msg in batch: - soup = BeautifulSoup(msg.value().decode('utf-8'), "xml") + soup = BeautifulSoup(msg.value().decode("utf-8"), "xml") self.importer.push_record(soup) soup.decompose() count += 1 @@ -771,8 +782,8 @@ class KafkaBs4XmlPusher(RecordPusher): self.consumer.close() return counts -class KafkaJsonPusher(RecordPusher): +class KafkaJsonPusher(RecordPusher): def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): self.importer = importer self.consumer = make_kafka_consumer( @@ -780,11 +791,11 @@ class KafkaJsonPusher(RecordPusher): kafka_env, topic_suffix, group, - kafka_namespace=kwargs.get('kafka_namespace', 'fatcat') + kafka_namespace=kwargs.get("kafka_namespace", "fatcat"), ) - self.poll_interval = kwargs.get('poll_interval', 5.0) - self.consume_batch_size = kwargs.get('consume_batch_size', 100) - self.force_flush = kwargs.get('force_flush', False) + self.poll_interval = kwargs.get("poll_interval", 5.0) + self.consume_batch_size = kwargs.get("consume_batch_size", 100) + self.force_flush = kwargs.get("force_flush", False) def run(self): count = 0 @@ -801,10 +812,13 @@ class KafkaJsonPusher(RecordPusher): # outstanding editgroups every 5 minutes, but there is still that # window when editgroups might be hanging (unsubmitted). batch = self.consumer.consume( - num_messages=self.consume_batch_size, - timeout=self.poll_interval) - print("... got {} kafka messages ({}sec poll interval) {}".format( - len(batch), self.poll_interval, self.importer.counts)) + num_messages=self.consume_batch_size, timeout=self.poll_interval + ) + print( + "... got {} kafka messages ({}sec poll interval) {}".format( + len(batch), self.poll_interval, self.importer.counts + ) + ) if self.force_flush: # this flushing happens even if there have been 'push' events # more recently. it is intended for, eg, importers off the @@ -821,7 +835,7 @@ class KafkaJsonPusher(RecordPusher): self.importer.finish() last_push = datetime.datetime.now() last_force_flush = datetime.datetime.now() - #print("Flushed any partial import batch: {}".format(self.importer.counts)) + # print("Flushed any partial import batch: {}".format(self.importer.counts)) continue # first check errors on entire batch... for msg in batch: @@ -829,7 +843,7 @@ class KafkaJsonPusher(RecordPusher): raise KafkaException(msg.error()) # ... then process for msg in batch: - record = json.loads(msg.value().decode('utf-8')) + record = json.loads(msg.value().decode("utf-8")) self.importer.push_record(record) count += 1 if count % 500 == 0: @@ -864,25 +878,25 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat print("Bailing out...") # TODO: should it be sys.exit(-1)? 
raise KafkaException(p.error) - #print("Kafka consumer commit successful") + # print("Kafka consumer commit successful") pass # previously, using pykafka - #auto_commit_enable=True, - #auto_commit_interval_ms=30000, # 30 seconds + # auto_commit_enable=True, + # auto_commit_interval_ms=30000, # 30 seconds conf = { - 'bootstrap.servers': hosts, - 'group.id': group, - 'on_commit': fail_fast, + "bootstrap.servers": hosts, + "group.id": group, + "on_commit": fail_fast, # messages don't have offset marked as stored until pushed to # elastic, but we do auto-commit stored offsets to broker - 'enable.auto.offset.store': False, - 'enable.auto.commit': True, + "enable.auto.offset.store": False, + "enable.auto.commit": True, # user code timeout; if no poll after this long, assume user code # hung and rebalance (default: 5min) - 'max.poll.interval.ms': 120000, - 'default.topic.config': { - 'auto.offset.reset': 'latest', + "max.poll.interval.ms": 120000, + "default.topic.config": { + "auto.offset.reset": "latest", }, } @@ -890,13 +904,13 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat for p in partitions: if p.error: raise KafkaException(p.error) - print("Kafka partitions rebalanced: {} / {}".format( - consumer, partitions)) + print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions)) consumer = Consumer(conf) # NOTE: it's actually important that topic_name *not* be bytes (UTF-8 # encoded) - consumer.subscribe([topic_name], + consumer.subscribe( + [topic_name], on_assign=on_rebalance, on_revoke=on_rebalance, ) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fd6936a4..606d4bb1 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,4 +1,3 @@ - import datetime import sqlite3 from typing import Any, Dict, Optional @@ -13,30 +12,30 @@ from .common import EntityImporter, clean # Can get a list of Crossref types (with counts) via API: # https://api.crossref.org/works?rows=0&facet=type-name:* CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = { - 'book': 'book', - 'book-chapter': 'chapter', - 'book-part': 'chapter', - 'book-section': 'chapter', - 'component': 'component', - 'dataset': 'dataset', - 'dissertation': 'thesis', - 'edited-book': 'book', - 'journal-article': 'article-journal', - 'monograph': 'book', - 'other': None, - 'peer-review': 'peer_review', - 'posted-content': 'post', - 'proceedings-article': 'paper-conference', - 'reference-book': 'book', - 'reference-entry': 'entry', - 'report': 'report', - 'standard': 'standard', + "book": "book", + "book-chapter": "chapter", + "book-part": "chapter", + "book-section": "chapter", + "component": "component", + "dataset": "dataset", + "dissertation": "thesis", + "edited-book": "book", + "journal-article": "article-journal", + "monograph": "book", + "other": None, + "peer-review": "peer_review", + "posted-content": "post", + "proceedings-article": "paper-conference", + "reference-book": "book", + "reference-entry": "entry", + "report": "report", + "standard": "standard", } CONTAINER_TYPE_MAP: Dict[str, str] = { - 'article-journal': 'journal', - 'paper-conference': 'conference', - 'book': 'book-series', + "article-journal": "journal", + "paper-conference": "conference", + "book": "book-series", } # These are based, informally, on sorting the most popular licenses found in @@ -90,29 +89,41 @@ LICENSE_SLUG_MAP: Dict[str, str] = { "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", } + def 
lookup_license_slug(raw: str) -> Optional[str]: if not raw: return None - raw = raw.strip().replace('http://', '//').replace('https://', '//') - if 'creativecommons.org' in raw.lower(): + raw = raw.strip().replace("http://", "//").replace("https://", "//") + if "creativecommons.org" in raw.lower(): raw = raw.lower() - raw = raw.replace('/legalcode', '/').replace('/uk', '') - if not raw.endswith('/'): - raw = raw + '/' + raw = raw.replace("/legalcode", "/").replace("/uk", "") + if not raw.endswith("/"): + raw = raw + "/" return LICENSE_SLUG_MAP.get(raw) + def test_lookup_license_slug(): assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" - assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY" - assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0" + assert ( + lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") + == "CC-BY" + ) + assert ( + lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") + == "CC-0" + ) assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" - assert lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA" + assert ( + lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") + == "CC-BY-NC-SA" + ) assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None assert lookup_license_slug("") is None assert lookup_license_slug(None) is None + class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. @@ -124,18 +135,22 @@ class CrossrefImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc: Optional[str] = kwargs.get('editgroup_description', - "Automated import of Crossref DOI metadata, harvested from REST API") - eg_extra: Optional[dict] = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter') - super().__init__(api, + eg_desc: Optional[str] = kwargs.get( + "editgroup_description", + "Automated import of Crossref DOI metadata, harvested from REST API", + ) + eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers: bool = kwargs.get('create_containers', True) - extid_map_file = kwargs.get('extid_map_file') + self.create_containers: bool = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db: Optional[Any] = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -148,12 +163,27 @@ class CrossrefImporter(EntityImporter): def lookup_ext_ids(self, doi: str) -> Optional[Any]: if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -173,17 +203,17 @@ class CrossrefImporter(EntityImporter): return CONTAINER_TYPE_MAP.get(crossref_type) def want(self, obj: Dict[str, Any]) -> bool: - if not obj.get('title'): - self.counts['skip-blank-title'] += 1 + if not obj.get("title"): + self.counts["skip-blank-title"] += 1 return False # these are pre-registered DOIs before the actual record is ready # title is a list of titles - titles = obj.get('title') + titles = obj.get("title") if titles is not None and titles[0].strip().lower() in [ - "OUP accepted manuscript".lower(), - ]: - self.counts['skip-stub-title'] += 1 + "OUP accepted manuscript".lower(), + ]: + self.counts["skip-stub-title"] += 1 return False # do most of these checks in-line below @@ -197,86 +227,105 @@ class CrossrefImporter(EntityImporter): # Ways to be out of scope (provisionally) # journal-issue and journal-volume map to None, but allowed for now - if obj.get('type') in (None, 'journal', 'proceedings', - 'standard-series', 'report-series', 'book-series', 'book-set', - 'book-track', 'proceedings-series'): - self.counts['skip-release-type'] += 1 + if obj.get("type") in ( + None, + "journal", + "proceedings", + "standard-series", + "report-series", + "book-series", + "book-set", + "book-track", + "proceedings-series", + ): + self.counts["skip-release-type"] += 1 return None # Do require the 'title' keys to exist, as release entities do - if ('title' not in obj) or (not obj['title']): - self.counts['skip-blank-title'] += 1 + if ("title" not in obj) or (not obj["title"]): + self.counts["skip-blank-title"] += 1 return None - release_type = self.map_release_type(obj['type']) + release_type = self.map_release_type(obj["type"]) # contribs def do_contribs(obj_list, ctype): contribs = [] for i, am in enumerate(obj_list): creator_id = None - if 'ORCID' in am.keys(): - creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) + if "ORCID" in am.keys(): + creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1]) # Sorry humans :( - if am.get('given') and am.get('family'): - raw_name = "{} {}".format(am['given'], am['family']) - elif am.get('family'): - raw_name = am['family'] + if am.get("given") and am.get("family"): + raw_name = "{} {}".format(am["given"], am["family"]) + elif am.get("family"): + raw_name = am["family"] else: # TODO: can end up empty - raw_name = am.get('name') or am.get('given') + raw_name = am.get("name") or am.get("given") extra = dict() if ctype == "author": index = i else: index = None raw_affiliation = None - if am.get('affiliation'): - if len(am.get('affiliation')) > 0: - raw_affiliation = am.get('affiliation')[0]['name'] - if len(am.get('affiliation')) > 1: + if am.get("affiliation"): + if len(am.get("affiliation")) > 0: + raw_affiliation = am.get("affiliation")[0]["name"] + if len(am.get("affiliation")) > 1: # note: affiliation => more_affiliations - extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] - if am.get('sequence') and am.get('sequence') != "additional": - extra['seq'] = clean(am.get('sequence')) + extra["more_affiliations"] = [ + clean(a["name"]) for a in am.get("affiliation")[1:] + ] + if 
am.get("sequence") and am.get("sequence") != "additional": + extra["seq"] = clean(am.get("sequence")) if not extra: extra = None assert ctype in ("author", "editor", "translator") raw_name = clean(raw_name) - contribs.append(fatcat_openapi_client.ReleaseContrib( - creator_id=creator_id, - index=index, - raw_name=raw_name, - given_name=clean(am.get('given')), - surname=clean(am.get('family')), - raw_affiliation=clean(raw_affiliation), - role=ctype, - extra=extra)) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=index, + raw_name=raw_name, + given_name=clean(am.get("given")), + surname=clean(am.get("family")), + raw_affiliation=clean(raw_affiliation), + role=ctype, + extra=extra, + ) + ) return contribs - contribs = do_contribs(obj.get('author', []), "author") - contribs.extend(do_contribs(obj.get('editor', []), "editor")) - contribs.extend(do_contribs(obj.get('translator', []), "translator")) + + contribs = do_contribs(obj.get("author", []), "author") + contribs.extend(do_contribs(obj.get("editor", []), "editor")) + contribs.extend(do_contribs(obj.get("translator", []), "translator")) # container - issn = obj.get('ISSN', [None])[0] + issn = obj.get("ISSN", [None])[0] issnl = self.issn2issnl(issn) container_id = None if issnl: container_id = self.lookup_issnl(issnl) - publisher = clean(obj.get('publisher')) + publisher = clean(obj.get("publisher")) - container_name = obj.get('container-title') + container_name = obj.get("container-title") if container_name: container_name = clean(container_name[0], force_xml=True) if not container_name: container_name = None - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=container_name) + name=container_name, + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -284,21 +333,21 @@ class CrossrefImporter(EntityImporter): # license slug license_slug = None license_extra = [] - for lic in obj.get('license', []): - if lic['content-version'] not in ('vor', 'unspecified'): + for lic in obj.get("license", []): + if lic["content-version"] not in ("vor", "unspecified"): continue - slug = lookup_license_slug(lic['URL']) + slug = lookup_license_slug(lic["URL"]) if slug: license_slug = slug - if 'start' in lic: - lic['start'] = lic['start']['date-time'] + if "start" in lic: + lic["start"] = lic["start"]["date-time"] license_extra.append(lic) # references refs = [] - for i, rm in enumerate(obj.get('reference', [])): + for i, rm in enumerate(obj.get("reference", [])): try: - year: Optional[int] = int(rm.get('year')) + year: Optional[int] = int(rm.get("year")) # TODO: will need to update/config in the future! # NOTE: are there crossref works with year < 100? 
if year is not None: @@ -307,56 +356,78 @@ class CrossrefImporter(EntityImporter): except (TypeError, ValueError): year = None ref_extra: Dict[str, Any] = dict() - key = rm.get('key') - if key and key.startswith(obj['DOI'].upper()): - key = key.replace(obj['DOI'].upper() + "-", '') - key = key.replace(obj['DOI'].upper(), '') - ref_container_name = rm.get('volume-title') + key = rm.get("key") + if key and key.startswith(obj["DOI"].upper()): + key = key.replace(obj["DOI"].upper() + "-", "") + key = key.replace(obj["DOI"].upper(), "") + ref_container_name = rm.get("volume-title") if not ref_container_name: - ref_container_name = rm.get('journal-title') - elif rm.get('journal-title'): - ref_extra['journal-title'] = rm['journal-title'] - if rm.get('DOI'): - ref_extra['doi'] = rm.get('DOI').lower() - author = clean(rm.get('author')) + ref_container_name = rm.get("journal-title") + elif rm.get("journal-title"): + ref_extra["journal-title"] = rm["journal-title"] + if rm.get("DOI"): + ref_extra["doi"] = rm.get("DOI").lower() + author = clean(rm.get("author")) if author: - ref_extra['authors'] = [author] - for k in ('editor', 'edition', 'authority', 'version', 'genre', - 'url', 'event', 'issue', 'volume', 'date', 'accessed_date', - 'issued', 'page', 'medium', 'collection_title', 'chapter_number', - 'unstructured', 'series-title', 'volume-title'): + ref_extra["authors"] = [author] + for k in ( + "editor", + "edition", + "authority", + "version", + "genre", + "url", + "event", + "issue", + "volume", + "date", + "accessed_date", + "issued", + "page", + "medium", + "collection_title", + "chapter_number", + "unstructured", + "series-title", + "volume-title", + ): if clean(rm.get(k)): ref_extra[k] = clean(rm[k]) if not ref_extra: ref_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - index=i, - # doing lookups would be a second import pass - target_release_id=None, - key=key, - year=year, - container_name=clean(ref_container_name), - title=clean(rm.get('article-title')), - locator=clean(rm.get('first-page')), - # TODO: just dump JSON somewhere here? - extra=ref_extra)) + refs.append( + fatcat_openapi_client.ReleaseRef( + index=i, + # doing lookups would be a second import pass + target_release_id=None, + key=key, + year=year, + container_name=clean(ref_container_name), + title=clean(rm.get("article-title")), + locator=clean(rm.get("first-page")), + # TODO: just dump JSON somewhere here? 
+ extra=ref_extra, + ) + ) # abstracts abstracts = [] - abstract = clean(obj.get('abstract')) + abstract = clean(obj.get("abstract")) if abstract and len(abstract) > 10: - abstracts.append(fatcat_openapi_client.ReleaseAbstract( - mimetype="application/xml+jats", - content=abstract)) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + mimetype="application/xml+jats", content=abstract + ) + ) # extra fields extra = dict() extra_crossref = dict() # top-level extra keys if not container_id: - if obj.get('container-title'): - extra['container_name'] = container_name - for key in ('group-title'): + if obj.get("container-title"): + extra["container_name"] = container_name + for key in "group-title": val = obj.get(key) if val: if type(val) == list: @@ -368,7 +439,7 @@ class CrossrefImporter(EntityImporter): else: extra[key] = val # crossref-nested extra keys - for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'): + for key in ("subject", "type", "alternative-id", "archive", "funder"): val = obj.get(key) if val: if type(val) == str: @@ -376,46 +447,51 @@ class CrossrefImporter(EntityImporter): else: extra_crossref[key] = val if license_extra: - extra_crossref['license'] = license_extra + extra_crossref["license"] = license_extra - if len(obj['title']) > 1: - aliases = [clean(t) for t in obj['title'][1:]] + if len(obj["title"]) > 1: + aliases = [clean(t) for t in obj["title"][1:]] aliases = [t for t in aliases if t] if aliases: - extra['aliases'] = aliases + extra["aliases"] = aliases # ISBN isbn13 = None - for raw in obj.get('ISBN', []): + for raw in obj.get("ISBN", []): # TODO: convert if not ISBN-13 format if len(raw) == 17: isbn13 = raw break # release status - if obj['type'] in ('journal-article', 'conference-proceeding', 'book', - 'dissertation', 'book-chapter'): + if obj["type"] in ( + "journal-article", + "conference-proceeding", + "book", + "dissertation", + "book-chapter", + ): release_stage = "published" else: # unknown release_stage = None # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj['DOI'].lower()) + extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) # filter out unreasonably huge releases if len(abstracts) > 100: - self.counts['skip-huge-abstracts'] += 1 + self.counts["skip-huge-abstracts"] += 1 return None if len(contribs) > 2000: - self.counts['skip-huge-contribs'] += 1 + self.counts["skip-huge-contribs"] += 1 return None if len(refs) > 5000: - self.counts['skip-huge-refs'] += 1 + self.counts["skip-huge-refs"] += 1 return None # release date parsing is amazingly complex - raw_date = obj['issued']['date-parts'][0] + raw_date = obj["issued"]["date-parts"][0] if not raw_date or not raw_date[0]: # got some NoneType, even though at least year is supposed to be set release_year = None @@ -429,28 +505,28 @@ class CrossrefImporter(EntityImporter): release_date = None original_title: Optional[str] = None - if obj.get('original-title'): - ot = obj.get('original-title') + if obj.get("original-title"): + ot = obj.get("original-title") if ot is not None: original_title = clean(ot[0], force_xml=True) title: Optional[str] = None - if obj.get('title'): - title = clean(obj.get('title')[0], force_xml=True) + if obj.get("title"): + title = clean(obj.get("title")[0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character - self.counts['skip-blank-title'] += 1 + self.counts["skip-blank-title"] += 1 return None subtitle = None - if obj.get('subtitle'): - subtitle = 
clean(obj.get('subtitle')[0], force_xml=True) + if obj.get("subtitle"): + subtitle = clean(obj.get("subtitle")[0], force_xml=True) if not subtitle or len(subtitle) <= 1: # subtitle can't be just a single character subtitle = None if extra_crossref: - extra['crossref'] = extra_crossref + extra["crossref"] = extra_crossref if not extra: extra = None @@ -466,19 +542,19 @@ class CrossrefImporter(EntityImporter): release_year=release_year, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=obj['DOI'].lower(), - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], + doi=obj["DOI"].lower(), + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], isbn13=isbn13, - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), - volume=clean(obj.get('volume')), - issue=clean(obj.get('issue')), - pages=clean(obj.get('page')), - language=clean(obj.get('language')), + volume=clean(obj.get("volume")), + issue=clean(obj.get("issue")), + pages=clean(obj.get("page")), + language=clean(obj.get("language")), license_slug=license_slug, extra=extra, abstracts=abstracts, @@ -500,14 +576,17 @@ class CrossrefImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index a06c68a4..4c174b0b 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -30,126 +30,130 @@ MAX_ABSTRACT_LENGTH = 2048 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary CONTAINER_TYPE_MAP = { - 'Journal': 'journal', - 'Series': 'journal', - 'Book Series': 'book-series', + "Journal": "journal", + "Series": "journal", + "Book Series": "book-series", } # The docs/guide should be the canonical home for these mappings; update there # first. Map various datacite type types to CSL-ish types. None means TODO or # remove. 
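An illustrative sketch of how a mapping like the DATACITE_TYPE_MAP defined just below is typically consulted, mirroring the priority loop used later in this importer (for typeType in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral")); the record here is hypothetical:

# Sketch only: cascade over the type systems in priority order, using the
# DATACITE_TYPE_MAP defined just below; "hypothetical_types" is an invented record.
hypothetical_types = {"citeproc": None, "ris": "RPRT", "resourceTypeGeneral": "Text"}
release_type = None
for type_system in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"):
    release_type = DATACITE_TYPE_MAP.get(type_system, {}).get(hypothetical_types.get(type_system))
    if release_type is not None:
        break
# release_type is now "report" (from the RIS "RPRT" entry), since citeproc gave no value.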
DATACITE_TYPE_MAP = { - 'ris': { - 'THES': 'thesis', - 'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report) - 'CHAP': 'chapter', - 'FIGURE': 'figure', - 'RPRT': 'report', - 'JOUR': 'article-journal', - 'MPCT': 'motion_picture', - 'GEN': 'article-journal', # GEN consist of 99% article and report, post-weblog, misc - and one dataset - 'BOOK': 'book', - 'DATA': 'dataset', - 'COMP': 'software', + "ris": { + "THES": "thesis", + "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report) + "CHAP": "chapter", + "FIGURE": "figure", + "RPRT": "report", + "JOUR": "article-journal", + "MPCT": "motion_picture", + "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset + "BOOK": "book", + "DATA": "dataset", + "COMP": "software", }, - 'schemaOrg': { - 'Dataset': 'dataset', - 'Book': 'book', - 'ScholarlyArticle': 'article-journal', - 'ImageObject': 'graphic', - 'Collection': None, - 'MediaObject': None, - 'Event': None, - 'SoftwareSourceCode': 'software', - 'Chapter': 'chapter', - 'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. - 'PublicationIssue': 'article', - 'AudioObject': None, - 'Thesis': 'thesis', + "schemaOrg": { + "Dataset": "dataset", + "Book": "book", + "ScholarlyArticle": "article-journal", + "ImageObject": "graphic", + "Collection": None, + "MediaObject": None, + "Event": None, + "SoftwareSourceCode": "software", + "Chapter": "chapter", + "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. + "PublicationIssue": "article", + "AudioObject": None, + "Thesis": "thesis", }, - 'citeproc': { - 'article': 'article', - 'article-journal': 'article-journal', - 'article-magazine': 'article-magazine', - 'article-newspaper': 'article-newspaper', - 'bill': 'bill', - 'book': 'book', - 'broadcast': 'broadcast', - 'chapter': 'chapter', - 'dataset': 'dataset', - 'entry-dictionary': 'entry-dictionary', - 'entry-encyclopedia': 'entry-encyclopedia', - 'entry': 'entry', - 'figure': 'figure', - 'graphic': 'graphic', - 'interview': 'interview', - 'legal_case': 'legal_case', - 'legislation': 'legislation', - 'manuscript': 'manuscript', - 'map': 'map', - 'motion_picture': 'motion_picture', - 'musical_score': 'musical_score', - 'pamphlet': 'pamphlet', - 'paper-conference': 'paper-conference', - 'patent': 'patent', - 'personal_communication': 'personal_communication', - 'post': 'post', - 'post-weblog': 'post-weblog', - 'report': 'report', - 'review-book': 'review-book', - 'review': 'review', - 'song': 'song', - 'speech': 'speech', - 'thesis': 'thesis', - 'treaty': 'treaty', - 'webpage': 'webpage', + "citeproc": { + "article": "article", + "article-journal": "article-journal", + "article-magazine": "article-magazine", + "article-newspaper": "article-newspaper", + "bill": "bill", + "book": "book", + "broadcast": "broadcast", + "chapter": "chapter", + "dataset": "dataset", + "entry-dictionary": "entry-dictionary", + "entry-encyclopedia": "entry-encyclopedia", + "entry": "entry", + "figure": "figure", + "graphic": "graphic", + "interview": "interview", + "legal_case": "legal_case", + "legislation": "legislation", + "manuscript": "manuscript", + "map": "map", + "motion_picture": "motion_picture", + "musical_score": "musical_score", + "pamphlet": "pamphlet", + "paper-conference": "paper-conference", + "patent": "patent", + "personal_communication": "personal_communication", + "post": "post", + "post-weblog": 
"post-weblog", + "report": "report", + "review-book": "review-book", + "review": "review", + "song": "song", + "speech": "speech", + "thesis": "thesis", + "treaty": "treaty", + "webpage": "webpage", }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types - 'bibtex': { - 'phdthesis': 'thesis', - 'inbook': 'chapter', - 'misc': None, - 'article': 'article-journal', - 'book': 'book', + "bibtex": { + "phdthesis": "thesis", + "inbook": "chapter", + "misc": None, + "article": "article-journal", + "book": "book", }, - 'resourceTypeGeneral': { - 'Image': 'graphic', - 'Dataset': 'dataset', - 'PhysicalObject': None, - 'Collection': None, - 'Text': None, # "Greyliterature, labnotes, accompanyingmaterials" - 'Sound': None, - 'InteractiveResource': None, - 'Event': None, - 'Software': 'software', - 'Other': None, - 'Workflow': None, - 'Audiovisual': None, - } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 + "resourceTypeGeneral": { + "Image": "graphic", + "Dataset": "dataset", + "PhysicalObject": None, + "Collection": None, + "Text": None, # "Greyliterature, labnotes, accompanyingmaterials" + "Sound": None, + "InteractiveResource": None, + "Event": None, + "Software": "software", + "Other": None, + "Workflow": None, + "Audiovisual": None, + }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 } # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. DATACITE_UNKNOWN_MARKERS = ( - '(:unac)', # temporarily inaccessible - '(:unal)', # unallowed, suppressed intentionally - '(:unap)', # not applicable, makes no sense - '(:unas)', # value unassigned (e.g., Untitled) - '(:unav)', # value unavailable, possibly unknown - '(:unkn)', # known to be unknown (e.g., Anonymous, Inconnue) - '(:none)', # never had a value, never will - '(:null)', # explicitly and meaningfully empty - '(:tba)', # to be assigned or announced later - '(:etal)', # too numerous to list (et alia) + "(:unac)", # temporarily inaccessible + "(:unal)", # unallowed, suppressed intentionally + "(:unap)", # not applicable, makes no sense + "(:unas)", # value unassigned (e.g., Untitled) + "(:unav)", # value unavailable, possibly unknown + "(:unkn)", # known to be unknown (e.g., Anonymous, Inconnue) + "(:none)", # never had a value, never will + "(:null)", # explicitly and meaningfully empty + "(:tba)", # to be assigned or announced later + "(:etal)", # too numerous to list (et alia) ) # UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking # unknown values. -UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( - 'NA', - 'NN', - 'n.a.', - '[s.n.]', - 'Unknown', -))) +UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union( + set( + ( + "NA", + "NN", + "n.a.", + "[s.n.]", + "Unknown", + ) + ) +) # UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist. 
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) @@ -157,8 +161,20 @@ UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi DATACITE_TITLE_SPAM_WORDGROUPS = [ { - "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online', - 'free', 'hd', 'download', 'english', 'subtitle', 'bluray'), + "tokens": ( + "full", + "movies", + "movie", + "watch", + "streaming", + "online", + "free", + "hd", + "download", + "english", + "subtitle", + "bluray", + ), "min": 4, } ] @@ -205,28 +221,25 @@ class DataciteImporter(EntityImporter): """ Importer for datacite records. """ - def __init__(self, - api, - issn_map_file, - debug=False, - insert_log_file=None, - **kwargs): + + def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of Datacite DOI metadata, harvested from REST API" + "editgroup_description", + "Automated import of Datacite DOI metadata, harvested from REST API", ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DataciteImporter') - super().__init__(api, - issn_map_file=issn_map_file, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) - - self.create_containers = kwargs.get('create_containers', True) - extid_map_file = kwargs.get('extid_map_file') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DataciteImporter") + super().__init__( + api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs + ) + + self.create_containers = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -240,30 +253,34 @@ class DataciteImporter(EntityImporter): self.insert_log_file = insert_log_file self.this_year = datetime.datetime.now().year - print('datacite with debug={}'.format(self.debug), file=sys.stderr) + print("datacite with debug={}".format(self.debug), file=sys.stderr) def lookup_ext_ids(self, doi): """ Return dictionary of identifiers referring to the same things as the given DOI. """ if self.extid_map_db is None: - return dict(core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None) + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -280,22 +297,22 @@ class DataciteImporter(EntityImporter): """ if not obj or not isinstance(obj, dict): return None - if 'attributes' not in obj: + if "attributes" not in obj: return None - attributes = obj['attributes'] - doi = clean_doi(attributes.get('doi', '').lower()) + attributes = obj["attributes"] + doi = clean_doi(attributes.get("doi", "").lower()) if not doi: - print('skipping record without a DOI', file=sys.stderr) + print("skipping record without a DOI", file=sys.stderr) return if not str.isascii(doi): - print('[{}] skipping non-ascii doi for now'.format(doi)) + print("[{}] skipping non-ascii doi for now".format(doi)) return None - creators = attributes.get('creators', []) or [] - contributors = attributes.get('contributors', []) or [] # Much fewer than creators. + creators = attributes.get("creators", []) or [] + contributors = attributes.get("contributors", []) or [] # Much fewer than creators. contribs = self.parse_datacite_creators(creators, doi=doi) @@ -323,7 +340,9 @@ class DataciteImporter(EntityImporter): # Related: https://guide.fatcat.wiki/entity_release.html -- role # (string, of a set): the type of contribution, from a controlled # vocabulary. TODO: vocabulary needs review. - contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) + contribs_extra_contributors = self.parse_datacite_creators( + contributors, set_index=False, doi=doi + ) # Unfortunately, creators and contributors might overlap, refs GH59. for cc in contribs_extra_contributors: @@ -333,17 +352,16 @@ class DataciteImporter(EntityImporter): # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" - titles = attributes.get('titles', []) or [] - title, original_language_title, subtitle = parse_datacite_titles( - titles) + titles = attributes.get("titles", []) or [] + title, original_language_title, subtitle = parse_datacite_titles(titles) if title is None: - print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False title = clean(title) if not title: - print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False # check for blocklisted "spam", e.g. "FULL MOVIE" @@ -367,10 +385,13 @@ class DataciteImporter(EntityImporter): # "Collected", "Copyrighted", "Created", "Issued", "Submitted", # "Updated", "Valid". release_date, release_month, release_year = parse_datacite_dates( - attributes.get('dates', [])) + attributes.get("dates", []) + ) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_date = None release_month = None release_year = None @@ -378,26 +399,30 @@ class DataciteImporter(EntityImporter): # Some records do not use the "dates" field (e.g. 
micropub), but: # "attributes.published" or "attributes.publicationYear" if not any((release_date, release_month, release_year)): - release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) + release_date, release_month, release_year = parse_single_date( + attributes.get("publicationYear") + ) if not any((release_date, release_month, release_year)): - release_date, release_month, release_year = parse_single_date(attributes.get('published')) + release_date, release_month, release_year = parse_single_date( + attributes.get("published") + ) if not any((release_date, release_month, release_year)): - print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr) + print("[{}] record w/o date: {}".format(doi, obj), file=sys.stderr) # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". - release_stage = 'published' + release_stage = "published" # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, # we might want something else than 'published'. See also: # https://support.datacite.org/docs/doi-states. # Publisher. A few NA values. A few bogus values. - publisher = attributes.get('publisher') + publisher = attributes.get("publisher") - if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): + if publisher in UNKNOWN_MARKERS | set(("Unpublished", "Unknown")): publisher = None release_stage = None if publisher is not None and len(publisher) > 80: @@ -416,24 +441,26 @@ class DataciteImporter(EntityImporter): container_id = None container_name = None - container = attributes.get('container', {}) or {} - if container.get('type') in CONTAINER_TYPE_MAP.keys(): - container_type = CONTAINER_TYPE_MAP.get(container['type']) - if container.get('identifier') and container.get( - 'identifierType') == 'ISSN': - issn = container.get('identifier') + container = attributes.get("container", {}) or {} + if container.get("type") in CONTAINER_TYPE_MAP.keys(): + container_type = CONTAINER_TYPE_MAP.get(container["type"]) + if container.get("identifier") and container.get("identifierType") == "ISSN": + issn = container.get("identifier") if len(issn) == 8: issn = issn[:4] + "-" + issn[4:] issnl = self.issn2issnl(issn) if issnl is not None: container_id = self.lookup_issnl(issnl) - if container_id is None and container.get('title'): - container_name = container.get('title') + if container_id is None and container.get("title"): + container_name = container.get("title") if isinstance(container_name, list): if len(container_name) > 0: - print('[{}] too many container titles: {}'.format(doi, - len(container_name))) + print( + "[{}] too many container titles: {}".format( + doi, len(container_name) + ) + ) container_name = container_name[0] assert isinstance(container_name, str) ce = fatcat_openapi_client.ContainerEntity( @@ -447,21 +474,24 @@ class DataciteImporter(EntityImporter): else: # TODO(martin): factor this out into a testable function. # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013 - container_name = container.get('title') + container_name = container.get("title") if isinstance(container_name, list): if len(container_name) > 0: - print('[{}] too many container titles: {}'.format(doi, - len(container_name))) + print( + "[{}] too many container titles: {}".format( + doi, len(container_name) + ) + ) container_name = container_name[0] # Exception: https://www.micropublication.org/, see: !MR24. 
if container_id is None and container_name is None: - if publisher and publisher.lower().startswith('micropublication'): + if publisher and publisher.lower().startswith("micropublication"): container_name = publisher # Volume and issue. - volume = container.get('volume') - issue = container.get('issue') + volume = container.get("volume") + issue = container.get("issue") if volume: volume = clean(volume) @@ -472,13 +502,13 @@ class DataciteImporter(EntityImporter): # Pages. pages = None - first_page = container.get('firstPage') - last_page = container.get('lastPage') + first_page = container.get("firstPage") + last_page = container.get("lastPage") if first_page and last_page: try: _ = int(first_page) < int(last_page) - pages = '{}-{}'.format(first_page, last_page) + pages = "{}-{}".format(first_page, last_page) except ValueError as err: # noqa: F841 # TODO(martin): This is more debug than info. # print('[{}] {}'.format(doi, err), file=sys.stderr) @@ -491,8 +521,8 @@ class DataciteImporter(EntityImporter): license_slug = None license_extra = [] - for lic in attributes.get('rightsList', []): - slug = lookup_license_slug(lic.get('rightsUri')) + for lic in attributes.get("rightsList", []): + slug = lookup_license_slug(lic.get("rightsUri")) if slug: license_slug = slug license_extra.append(lic) @@ -506,7 +536,7 @@ class DataciteImporter(EntityImporter): # library solves it for you." -- TODO(martin): We need more of these. language = None - value = attributes.get('language', '') or '' + value = attributes.get("language", "") or "" try: language = pycountry.languages.lookup(value).alpha_2 except (LookupError, AttributeError) as err: # noqa: F841 @@ -520,22 +550,22 @@ class DataciteImporter(EntityImporter): # "Other" fields might contain references or related articles (with # DOI). TODO(martin): maybe try to parse out some of those refs. abstracts = [] - descs = attributes.get('descriptions', []) or [] + descs = attributes.get("descriptions", []) or [] for desc in descs: - if not desc.get('descriptionType') == 'Abstract': + if not desc.get("descriptionType") == "Abstract": continue # Description maybe a string, int or list. - text = desc.get('description', '') + text = desc.get("description", "") if not text: continue if isinstance(text, int): - text = '{}'.format(text) + text = "{}".format(text) if isinstance(text, list): try: text = "\n".join(text) except TypeError: - continue # Bail out, if it is not a list of strings. + continue # Bail out, if it is not a list of strings. # Limit length. if len(text) < 10: @@ -548,7 +578,10 @@ class DataciteImporter(EntityImporter): try: lang = langdetect.detect(text) except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: - print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) + print( + "[{}] language detection failed with {} on {}".format(doi, err, text), + file=sys.stderr, + ) abstract_text = clean(text) if not abstract_text: continue @@ -557,7 +590,8 @@ class DataciteImporter(EntityImporter): mimetype="text/plain", content=abstract_text, lang=lang, - )) + ) + ) # References and relations. Datacite include many relation types in # "attributes.relatedIdentifiers[].relationType", e.g. @@ -570,67 +604,76 @@ class DataciteImporter(EntityImporter): # For the moment, we only care about References. 
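The loop that follows keeps only "References"/"Cites" relations and records a DOI, when present, in the ref extra field. A hypothetical relatedIdentifiers entry of the shape that loop accepts:

# Hypothetical relatedIdentifiers entry; field names match the parsing code below,
# the DOI value is invented.
related = {
    "relationType": "References",
    "relatedIdentifierType": "DOI",
    "relatedIdentifier": "10.1234/example.5678",
}
# The loop below would turn this into ReleaseRef(index=0, extra={"doi": "10.1234/example.5678"}).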
refs, ref_index = [], 0 - relIds = attributes.get('relatedIdentifiers', []) or [] + relIds = attributes.get("relatedIdentifiers", []) or [] for rel in relIds: - if not rel.get('relationType', '') in ('References', 'Cites'): + if not rel.get("relationType", "") in ("References", "Cites"): continue ref_extra = dict() - if rel.get('relatedIdentifierType', '') == 'DOI': - ref_extra['doi'] = rel.get('relatedIdentifier') + if rel.get("relatedIdentifierType", "") == "DOI": + ref_extra["doi"] = rel.get("relatedIdentifier") if not ref_extra: ref_extra = None refs.append( fatcat_openapi_client.ReleaseRef( index=ref_index, extra=ref_extra, - )) + ) + ) ref_index += 1 # More specific release_type via 'Reviews' relationsship. for rel in relIds: - if rel.get('relatedIdentifierType', '') != 'Reviews': + if rel.get("relatedIdentifierType", "") != "Reviews": continue - release_type = 'review' + release_type = "review" # Extra information. extra_datacite = dict() if license_extra: - extra_datacite['license'] = license_extra - if attributes.get('subjects'): - extra_datacite['subjects'] = attributes['subjects'] + extra_datacite["license"] = license_extra + if attributes.get("subjects"): + extra_datacite["subjects"] = attributes["subjects"] # Include version information. - metadata_version = attributes.get('metadataVersion') or '' + metadata_version = attributes.get("metadataVersion") or "" if metadata_version: - extra_datacite['metadataVersion'] = metadata_version + extra_datacite["metadataVersion"] = metadata_version # Include resource types. - types = attributes.get('types', {}) or {} - resource_type = types.get('resourceType', '') or '' - resource_type_general = types.get('resourceTypeGeneral', '') or '' + types = attributes.get("types", {}) or {} + resource_type = types.get("resourceType", "") or "" + resource_type_general = types.get("resourceTypeGeneral", "") or "" if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER: - extra_datacite['resourceType'] = resource_type + extra_datacite["resourceType"] = resource_type if resource_type_general and resource_type_general.lower() not in UNKNOWN_MARKERS_LOWER: - extra_datacite['resourceTypeGeneral'] = resource_type_general + extra_datacite["resourceTypeGeneral"] = resource_type_general # Include certain relations from relatedIdentifiers. Keeping the # original structure of data here, which is a list of dicts, with # relation type, identifier and identifier type (mostly). 
relations = [] for rel in relIds: - if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues', - 'IsVariantFormOf', 'IsSupplementTo', - 'HasVersion', 'IsMetadataFor', - 'IsNewVersionOf', 'IsIdenticalTo', - 'IsVersionOf', 'IsDerivedFrom', - 'IsSourceOf'): + if rel.get("relationType") in ( + "IsPartOf", + "Reviews", + "Continues", + "IsVariantFormOf", + "IsSupplementTo", + "HasVersion", + "IsMetadataFor", + "IsNewVersionOf", + "IsIdenticalTo", + "IsVersionOf", + "IsDerivedFrom", + "IsSourceOf", + ): relations.append(rel) if relations: - extra_datacite['relations'] = relations + extra_datacite["relations"] = relations extra = dict() @@ -640,18 +683,18 @@ class DataciteImporter(EntityImporter): # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null, # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0", # "10161", "10010691", "10780", # "Presentación" - version = attributes.get('version') or None + version = attributes.get("version") or None # top-level extra keys if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name # Always include datacite key, even if value is empty (dict). - extra['datacite'] = extra_datacite + extra["datacite"] = extra_datacite # Preparation for a schema update. if release_month: - extra['release_month'] = release_month + extra["release_month"] = release_month extids = self.lookup_ext_ids(doi=doi) @@ -669,12 +712,12 @@ class DataciteImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), contribs=contribs, volume=volume, @@ -702,19 +745,19 @@ class DataciteImporter(EntityImporter): """ release_type = None - if not attributes.get('types'): + if not attributes.get("types"): return None - types = attributes['types'] + types = attributes["types"] - for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): + for typeType in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"): value = types.get(typeType) release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) if release_type is not None: break # special case: figshare "collections" which group other entities - if doi.startswith('10.6084/') or doi.startswith('10.25384'): - if types.get('resourceType') == "Collection": + if doi.startswith("10.6084/") or doi.startswith("10.25384"): + if types.get("resourceType") == "Collection": release_type = "stub" if release_type is None: @@ -736,35 +779,41 @@ class DataciteImporter(EntityImporter): # publishes highly interesting datasets, but titles are mostly the same # ("GBIF Occurrence Download" or "Occurrence Download"); set # release_type to "stub" (CSL/FC). 
- if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'): - re.release_type = 'stub' + if re.title == "GBIF Occurrence Download" and re.ext_ids.doi.startswith("10.15468/dl."): + re.release_type = "stub" # release_type exception: lots of "Experimental Crystal Structure Determination" # publisher: "Cambridge Crystallographic Data Centre" - if re.ext_ids.doi.startswith('10.5517/'): - re.release_type = 'entry' + if re.ext_ids.doi.startswith("10.5517/"): + re.release_type = "entry" # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." - if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'): - re.release_type = 'component' + if re.title.lower().startswith("additional file") and re.release_type in ( + "article", + "article-journal", + ): + re.release_type = "component" # figshare - if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'): + if re.ext_ids.doi.startswith("10.6084/") or re.ext_ids.doi.startswith("10.25384"): # set version if DOI ends with versioned suffix - doi_suffix = re.ext_ids.doi.split('.')[-1] - if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit(): + doi_suffix = re.ext_ids.doi.split(".")[-1] + if doi_suffix and doi_suffix.startswith("v") and doi_suffix[1:].isdigit(): re.version = doi_suffix # "Figure 123 from " -> component # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean" - if " from " in re.title and re.release_type not in ('stub', 'graphic'): + if " from " in re.title and re.release_type not in ("stub", "graphic"): if re.title.startswith("Figure "): re.release_type = "component" elif re.title.startswith("Table "): re.release_type = "component" # figshare.com - if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None: - re.extra['container_name'] = "figshare.com" + if ( + re.ext_ids.doi.startswith("10.6084/m9.figshare.") + and re.extra.get("container_name") is None + ): + re.extra["container_name"] = "figshare.com" return re @@ -788,26 +837,28 @@ class DataciteImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - print('inserting batch ({})'.format(len(batch)), file=sys.stderr) + print("inserting batch ({})".format(len(batch)), file=sys.stderr) if self.insert_log_file: - with open(self.insert_log_file, 'a') as f: + with open(self.insert_log_file, "a") as f: for doc in batch: json.dump(entity_to_dict(doc, api_client=None), f) - f.write('\n') + f.write("\n") self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) - def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None): + def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None): """ Parses a list of creators into a list of ReleaseContrib objects. Set set_index to False, if the index contrib field should be left blank. 
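For orientation, the hunk that follows parses DataCite creator records into ReleaseContrib objects. A hypothetical creator entry of the shape it expects, limited to the fields the parser actually reads:

# Hypothetical DataCite creator record, shaped like the entries parsed below.
creator = {
    "nameType": "Personal",
    "givenName": "Jane",
    "familyName": "Doe",
    "nameIdentifiers": [
        {
            "nameIdentifierScheme": "ORCID",
            "nameIdentifier": "https://orcid.org/0000-0002-1825-0097",
        },
    ],
    "affiliation": ["Example University"],
}
# With no "name" key, the parser falls back to "givenName familyName" ("Jane Doe"),
# keeps only the first affiliation string, and strips the "https://orcid.org/"
# prefix before looking up a creator_id via ORCID.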
@@ -820,48 +871,53 @@ class DataciteImporter(EntityImporter): contribs = [] # Names, that should be ignored right away. - name_blocklist = set(('Occdownload Gbif.Org',)) + name_blocklist = set(("Occdownload Gbif.Org",)) i = 0 for c in creators: if not set_index: i = None - nameType = c.get('nameType', '') or '' - if nameType in ('', 'Personal'): + nameType = c.get("nameType", "") or "" + if nameType in ("", "Personal"): creator_id = None - for nid in c.get('nameIdentifiers', []) or []: + for nid in c.get("nameIdentifiers", []) or []: if not isinstance(nid, dict): # see: fatcat-workers/issues/44035/ - print('unexpected nameIdentifiers, expected list of dicts, got: {}'.format(nid), file=sys.stderr) + print( + "unexpected nameIdentifiers, expected list of dicts, got: {}".format( + nid + ), + file=sys.stderr, + ) continue - name_scheme = nid.get('nameIdentifierScheme', '') or '' + name_scheme = nid.get("nameIdentifierScheme", "") or "" if not name_scheme.lower() == "orcid": continue - orcid = nid.get('nameIdentifier') or '' - orcid = orcid.replace('https://orcid.org/', '') + orcid = nid.get("nameIdentifier") or "" + orcid = orcid.replace("https://orcid.org/", "") if not orcid: continue creator_id = self.lookup_orcid(orcid) # TODO(martin): If creator_id is None, should we create creators? # If there are multiple affiliation strings, use the first one. - affiliations = c.get('affiliation', []) or [] + affiliations = c.get("affiliation", []) or [] raw_affiliation = None if len(affiliations) == 0: raw_affiliation = None else: raw_affiliation = clean(affiliations[0]) - name = c.get('name') - given_name = c.get('givenName') - surname = c.get('familyName') + name = c.get("name") + given_name = c.get("givenName") + surname = c.get("familyName") if name: name = clean(name) if not any((name, given_name, surname)): continue if not name: - name = "{} {}".format(given_name or '', surname or '').strip() + name = "{} {}".format(given_name or "", surname or "").strip() if name in name_blocklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -881,7 +937,7 @@ class DataciteImporter(EntityImporter): if not name: continue - if raw_affiliation == '': + if raw_affiliation == "": continue extra = None @@ -891,39 +947,38 @@ class DataciteImporter(EntityImporter): # "RelatedPerson", "ProjectLeader", "Editor", "Other", # "ProjectMember", "Funder", "RightsHolder", "DataCollector", # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup" - contributorType = c.get('contributorType', '') or '' + contributorType = c.get("contributorType", "") or "" if contributorType: - extra = {'type': contributorType} + extra = {"type": contributorType} rc = fatcat_openapi_client.ReleaseContrib( - creator_id=creator_id, - index=i, - raw_name=name, - given_name=given_name, - surname=surname, - role=role, - raw_affiliation=raw_affiliation, - extra=extra, - ) + creator_id=creator_id, + index=i, + raw_name=name, + given_name=given_name, + surname=surname, + role=role, + raw_affiliation=raw_affiliation, + extra=extra, + ) # Filter out duplicates early. 
if not contributor_list_contains_contributor(contribs, rc): contribs.append(rc) if i is not None: i += 1 - elif nameType == 'Organizational': - name = c.get('name', '') or '' + elif nameType == "Organizational": + name = c.get("name", "") or "" if name in UNKNOWN_MARKERS: continue if len(name) < 3: continue - extra = {'organization': name} - contribs.append(fatcat_openapi_client.ReleaseContrib( - index=i, extra=extra)) + extra = {"organization": name} + contribs.append(fatcat_openapi_client.ReleaseContrib(index=i, extra=extra)) if i is not None: i += 1 else: - print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) + print("[{}] unknown name type: {}".format(doi, nameType), file=sys.stderr) return contribs @@ -935,8 +990,8 @@ def contributor_list_contains_contributor(contributor_list, contributor): for cc in contributor_list: if cc.raw_name != contributor.raw_name: continue - cc_role = cc.role or 'author' - contributor_role = contributor.role or 'author' + cc_role = cc.role or "author" + contributor_role = contributor.role or "author" if cc_role != contributor_role: continue return True @@ -952,91 +1007,97 @@ def lookup_license_slug(raw): if not raw: return None - if 'creativecommons.org/publicdomain/zero' in raw: - return 'CC-0' - if raw.lower().endswith('/cc0'): - return 'CC-0' + if "creativecommons.org/publicdomain/zero" in raw: + return "CC-0" + if raw.lower().endswith("/cc0"): + return "CC-0" - if 'creativecommons' in raw: + if "creativecommons" in raw: # https://creativecommons.org/publicdomain/mark/1.0/deed.de - if 'creativecommons.org/publicdomain' in raw: - return 'CC-PUBLICDOMAIN' - if 'creativecommons.org/share-your-work/public-domain/cc0' in raw: - return 'CC-0' + if "creativecommons.org/publicdomain" in raw: + return "CC-PUBLICDOMAIN" + if "creativecommons.org/share-your-work/public-domain/cc0" in raw: + return "CC-0" # https://creativecommons.org/licenses/by/4.0/deed.es_ES raw = raw.lower() - match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE) + match = re.search( + r"creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)", raw, re.IGNORECASE + ) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None - if not name.startswith('cc'): - name = 'cc-{}'.format(name) + if not name.startswith("cc"): + name = "cc-{}".format(name) return name.upper() - if 'opensource.org' in raw: + if "opensource.org" in raw: # https://opensource.org/licenses/alphabetical, e.g. 
opensource.org/licenses/EUPL-1.2 - match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE) + match = re.search(r"opensource.org/licenses/(?P<name>[^/]+)", raw, re.IGNORECASE) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 11: return None return name.upper() - if 'gnu.org' in raw: + if "gnu.org" in raw: # http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html - match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE) + match = re.search( + r"/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)", + raw, + re.IGNORECASE, + ) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 8: return None return name.upper() - if 'spdx.org' in raw: - if 'spdx.org/licenses/CC0' in raw: - return 'CC-0' + if "spdx.org" in raw: + if "spdx.org/licenses/CC0" in raw: + return "CC-0" # https://spdx.org/licenses/CC-BY-NC-ND-4.0.html - match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE) + match = re.search(r"spdx.org/licenses/(?P<name>[a-z0-9-]+)", raw, re.IGNORECASE) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 36: return None # cleanup version and extensions - name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower()) + name = re.sub("(-[0-9])?[.]?[0-9]?(.json|.html)?", "", name.lower()) return name.upper() - if 'rightsstatements.org' in raw: + if "rightsstatements.org" in raw: # http://rightsstatements.org/vocab/InC/1.0/ - match = re.search(r'rightsstatements.org/(vocab|page)/(?P<name>[^/]*)', raw) + match = re.search(r"rightsstatements.org/(vocab|page)/(?P<name>[^/]*)", raw) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 9: return None - return 'RS-{}'.format(name.upper()) + return "RS-{}".format(name.upper()) # Fallback to mapped values. 
raw = raw.lower() - raw = raw.strip().replace('http://', '//').replace('https://', '//') - if not raw.endswith('/'): - raw = raw + '/' + raw = raw.strip().replace("http://", "//").replace("https://", "//") + if not raw.endswith("/"): + raw = raw + "/" return LICENSE_SLUG_MAP.get(raw) @@ -1046,23 +1107,21 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3): Example input: {'title': 'Some title', 'original_language_title': 'Some title'} """ - if 'original_language_title' not in item: + if "original_language_title" not in item: return None - title = item.get('title') + title = item.get("title") if not title: return None - original_language_title = item.get('original_language_title') - if isinstance(original_language_title, - str) and title != original_language_title: + original_language_title = item.get("original_language_title") + if isinstance(original_language_title, str) and title != original_language_title: if len(original_language_title) < min_length: return None - if original_language_title.count('?') > max_questionmarks: + if original_language_title.count("?") > max_questionmarks: return None return original_language_title if isinstance(original_language_title, dict): - content = original_language_title.get('__content__', '') or '' - if content and content != title and not content.count( - '?') > max_questionmarks: + content = original_language_title.get("__content__", "") or "" + if content and content != title and not content.count("?") > max_questionmarks: return content return None @@ -1082,23 +1141,23 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle elif len(titles) == 1: original_language_title = find_original_language_title(titles[0]) - title = titles[0].get('title', '') or '' + title = titles[0].get("title", "") or "" title = title.strip() if not title: title = None return title, original_language_title, subtitle else: for entry in titles: - if not title and ('titleType' not in entry - or not entry.get('titleType')): - title = (entry.get('title') or '').strip() - if not subtitle and entry.get('titleType') == 'Subtitle': - subtitle = entry.get('title', '').strip() + if not title and ("titleType" not in entry or not entry.get("titleType")): + title = (entry.get("title") or "").strip() + if not subtitle and entry.get("titleType") == "Subtitle": + subtitle = entry.get("title", "").strip() if not original_language_title: original_language_title = find_original_language_title(entry) return title, original_language_title, subtitle + def parse_single_date(value): """ Given a single string containing a date in arbitrary format, try to return @@ -1113,11 +1172,11 @@ def parse_single_date(value): # Results in a dict with keys: date_obj, period, locale. parse_result = parser.get_date_data(value) # A datetime object, later we need a date, only. 
- result = parse_result['date_obj'] + result = parse_result["date_obj"] if result is not None: - if parse_result['period'] == 'year': + if parse_result["period"] == "year": return None, None, result.year - elif parse_result['period'] == 'month': + elif parse_result["period"] == "month": return None, result.month, result.year else: return result.date(), result.month, result.year @@ -1126,6 +1185,7 @@ def parse_single_date(value): return None, None, None + def parse_datacite_dates(dates): """ Given a list of date fields (under .dates), return tuple, (release_date, @@ -1137,37 +1197,37 @@ def parse_datacite_dates(dates): return release_date, release_month, release_year if not isinstance(dates, list): - raise ValueError('expected a list of date items') + raise ValueError("expected a list of date items") # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted", # "Collected", "Updated", "Copyrighted", "Created" # Ignored for now: "Collected", "Issued" date_type_prio = ( - 'Valid', - 'Available', - 'Accepted', - 'Submitted', - 'Copyrighted', - 'Created', - 'Updated', + "Valid", + "Available", + "Accepted", + "Submitted", + "Copyrighted", + "Created", + "Updated", ) # We need to note the granularity, since a string like "2019" would be # parsed into "2019-01-01", even though the month is unknown. Use 3 # granularity types: 'y', 'm', 'd'. - Pattern = collections.namedtuple('Pattern', 'layout granularity') + Pattern = collections.namedtuple("Pattern", "layout granularity") # Before using (expensive) dateparser, try a few common patterns. common_patterns = ( - Pattern('%Y-%m-%d', 'd'), - Pattern('%Y-%m', 'm'), - Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), - Pattern('%Y-%m-%dT%H:%M:%S', 'd'), - Pattern('%Y', 'y'), + Pattern("%Y-%m-%d", "d"), + Pattern("%Y-%m", "m"), + Pattern("%Y-%m-%dT%H:%M:%SZ", "d"), + Pattern("%Y-%m-%dT%H:%M:%S", "d"), + Pattern("%Y", "y"), ) def parse_item(item): - result, value, year_only = None, str(item.get('date', '')) or '', False + result, value, year_only = None, str(item.get("date", "")) or "", False release_date, release_month, release_year = None, None, None for layout, granularity in common_patterns: @@ -1176,22 +1236,22 @@ def parse_datacite_dates(dates): except ValueError: continue else: - if granularity == 'y': + if granularity == "y": year_only = True break if result is None: - print('fallback for {}'.format(value), file=sys.stderr) + print("fallback for {}".format(value), file=sys.stderr) release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. return release_date, release_month, release_year - if granularity != 'y': + if granularity != "y": release_date = result.date() release_year = result.year - if granularity in ('m', 'd'): + if granularity in ("m", "d"): release_month = result.month return release_date, release_month, release_year @@ -1200,7 +1260,7 @@ def parse_datacite_dates(dates): for prio in date_type_prio: for item in dates: - if not item.get('dateType') == prio: + if not item.get("dateType") == prio: continue release_date, release_month, release_year = parse_item(item) @@ -1224,45 +1284,49 @@ def parse_datacite_dates(dates): return release_date, release_month, release_year + def index_form_to_display_name(s): """ Try to convert an index form name, like 'Razis, Panos A' into display_name, e.g. 'Panos A Razis'. 
""" - if ',' not in s: + if "," not in s: return s - skip_on_chars = ['(', ')', '*'] + skip_on_chars = ["(", ")", "*"] for char in skip_on_chars: if char in s: return s - if s.count(',') > 1: + if s.count(",") > 1: # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" return s # Not names, but sprinkled in fields where authors live. - stopwords = [s.lower() for s in ( - 'Archive', - 'Collection', - 'Coordinator', - 'Department', - 'Germany', - 'International', - 'National', - 'Netherlands', - 'Office', - 'Organisation', - 'Organization', - 'Service', - 'Services', - 'United States', - 'University', - 'Verein', - 'Volkshochschule', - )] + stopwords = [ + s.lower() + for s in ( + "Archive", + "Collection", + "Coordinator", + "Department", + "Germany", + "International", + "National", + "Netherlands", + "Office", + "Organisation", + "Organization", + "Service", + "Services", + "United States", + "University", + "Verein", + "Volkshochschule", + ) + ] lower = s.lower() for stop in stopwords: if stop in lower: return s - a, b = s.split(',') - return '{} {}'.format(b.strip(), a.strip()) + a, b = s.split(",") + return "{} {}".format(b.strip(), a.strip()) diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py index 3d280fb7..603a6271 100644 --- a/python/fatcat_tools/importers/dblp_container.py +++ b/python/fatcat_tools/importers/dblp_container.py @@ -1,4 +1,3 @@ - """ Importer for DBLP container-level (journal/conference/series) metadata, pre-scraped in to JSON from HTML pages. @@ -13,17 +12,17 @@ from fatcat_tools.normal import clean_str class DblpContainerImporter(EntityImporter): + def __init__( + self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs + ): - def __init__(self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata scraped from dblp HTML") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DblpContainerImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of container-level metadata scraped from dblp HTML", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpContainerImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.dblp_container_map_output = dblp_container_map_output self.read_dblp_container_map_file(dblp_container_map_file) @@ -40,7 +39,10 @@ class DblpContainerImporter(EntityImporter): assert len(container_id) == 26 self._dblp_container_map[prefix] = container_id print("\t".join([prefix, container_id]), file=self.dblp_container_map_output) - print("Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr) + print( + "Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), + file=sys.stderr, + ) def lookup_dblp_prefix(self, prefix): if not prefix: @@ -57,48 +59,48 @@ class DblpContainerImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - dblp_prefix = row.get('key') or row.get('dblp_prefix') + dblp_prefix = row.get("key") or row.get("dblp_prefix") assert dblp_prefix - assert row['title'] + assert row["title"] container_type = None - if 
dblp_prefix.startswith('conf/'): + if dblp_prefix.startswith("conf/"): container_type = "conference-series" - elif dblp_prefix.startswith('journals/'): + elif dblp_prefix.startswith("journals/"): container_type = "journal" - elif dblp_prefix.startswith('series/'): + elif dblp_prefix.startswith("series/"): container_type = "book-series" issnl = None - for issn in row.get('issns', []): + for issn in row.get("issns", []): issnl = self.issn2issnl(issn) if issnl: break extra = { - 'dblp': { - 'prefix': dblp_prefix, + "dblp": { + "prefix": dblp_prefix, }, } - if row.get('homepage_url'): - extra['urls'] = [row['homepage_url']] + if row.get("homepage_url"): + extra["urls"] = [row["homepage_url"]] - if row.get('acronym'): - extra['acronym'] = row['acronym'] + if row.get("acronym"): + extra["acronym"] = row["acronym"] ce = fatcat_openapi_client.ContainerEntity( - name=clean_str(row['title']), + name=clean_str(row["title"]), container_type=container_type, issnl=issnl, - wikidata_qid=row.get('wikidata_qid'), + wikidata_qid=row.get("wikidata_qid"), extra=extra, ) return ce def try_update(self, ce): - dblp_prefix = ce.extra['dblp']['prefix'] + dblp_prefix = ce.extra["dblp"]["prefix"] existing = None existing_container_id = self.lookup_dblp_prefix(dblp_prefix) if existing_container_id: @@ -123,8 +125,11 @@ class DblpContainerImporter(EntityImporter): return True if existing: - self.counts['exists'] += 1 - print("\t".join([ce.extra['dblp']['prefix'], existing.ident]), file=self.dblp_container_map_output) + self.counts["exists"] += 1 + print( + "\t".join([ce.extra["dblp"]["prefix"], existing.ident]), + file=self.dblp_container_map_output, + ) return False # shouldn't get here @@ -135,11 +140,17 @@ class DblpContainerImporter(EntityImporter): Because we want to print a prefix/container_id match for each row, we require a special batch insert method """ - eg = self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + eg = self.api.create_container_auto_batch( + fatcat_openapi_client.ContainerAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) for c_edit in eg.edits.containers: c = self.api.get_container(c_edit.ident) - print("\t".join([c.extra['dblp']['prefix'], c.ident]), file=self.dblp_container_map_output) + print( + "\t".join([c.extra["dblp"]["prefix"], c.ident]), + file=self.dblp_container_map_output, + ) diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index 6d028f2f..5baa6cd6 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -1,4 +1,3 @@ - """ Importer for DBLP release-level (article/paper/etc) XML metadata. 
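(Aside, not part of the diff: the prefix-to-container_type mapping in DblpContainerImporter.parse_record() above condenses to the sketch below; the helper function is hypothetical and the sample keys are only examples.)

def container_type_for_dblp_prefix(dblp_prefix):
    # Mirrors the branch logic in parse_record() above.
    if dblp_prefix.startswith("conf/"):
        return "conference-series"
    if dblp_prefix.startswith("journals/"):
        return "journal"
    if dblp_prefix.startswith("series/"):
        return "book-series"
    return None

assert container_type_for_dblp_prefix("conf/sigmod") == "conference-series"
assert container_type_for_dblp_prefix("journals/cacm") == "journal"
assert container_type_for_dblp_prefix("series/lncs") == "book-series"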
@@ -44,25 +43,16 @@ from fatcat_tools.transforms import entity_to_dict class DblpReleaseImporter(EntityImporter): - - def __init__(self, - api, - dblp_container_map_file=None, - **kwargs): + def __init__(self, api, dblp_container_map_file=None, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of dblp metadata via XML records" + "editgroup_description", "Automated import of dblp metadata via XML records" ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DblpReleaseImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpReleaseImporter") # ensure default is to not do updates with this worker (override super() default) - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.dump_json_mode = kwargs.get("dump_json_mode", False) self.this_year = datetime.datetime.now().year @@ -76,13 +66,16 @@ class DblpReleaseImporter(EntityImporter): "phdthesis", "mastersthesis", "www", - #"data", # no instances in 2020-11 dump + # "data", # no instances in 2020-11 dump ] def read_dblp_container_map_file(self, dblp_container_map_file) -> None: self._dblp_container_map = dict() if not dblp_container_map_file: - print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr) + print( + "Not loading a dblp prefix container map file; entities will fail to import", + file=sys.stderr, + ) return print("Loading dblp prefix container map file...", file=sys.stderr) for line in dblp_container_map_file: @@ -92,7 +85,10 @@ class DblpReleaseImporter(EntityImporter): container_id = container_id.strip() assert len(container_id) == 26 self._dblp_container_map[prefix] = container_id - print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr) + print( + "Got {} dblp container mappings.".format(len(self._dblp_container_map)), + file=sys.stderr, + ) def lookup_dblp_prefix(self, prefix): if not prefix: @@ -101,13 +97,13 @@ class DblpReleaseImporter(EntityImporter): def want(self, xml_elem): if xml_elem.name not in self.ELEMENT_TYPES: - self.counts['skip-type'] += 1 + self.counts["skip-type"] += 1 return False - if not xml_elem.get('key'): - self.counts['skip-no-key'] += 1 + if not xml_elem.get("key"): + self.counts["skip-no-key"] += 1 return False - if xml_elem['key'].startswith('homepage/'): - self.counts['skip-type-homepage'] += 1 + if xml_elem["key"].startswith("homepage/"): + self.counts["skip-type-homepage"] += 1 return False return True @@ -127,88 +123,88 @@ class DblpReleaseImporter(EntityImporter): - isbn """ - dblp_key = xml_elem.get('key') + dblp_key = xml_elem.get("key") if not dblp_key: - self.counts['skip-empty-key'] += 1 + self.counts["skip-empty-key"] += 1 return False - dblp_key_type = dblp_key.split('/')[0] + dblp_key_type = dblp_key.split("/")[0] # dblp_prefix may be used for container lookup dblp_prefix = None - if dblp_key_type in ('journals', 'conf'): - dblp_prefix = '/'.join(dblp_key.split('/')[:2]) - elif dblp_key_type in ('series', 'reference', 'tr', 'books'): - dblp_prefix = '/'.join(dblp_key.split('/')[:-1]) + if dblp_key_type in ("journals", "conf"): + dblp_prefix = "/".join(dblp_key.split("/")[:2]) + elif dblp_key_type in 
("series", "reference", "tr", "books"): + dblp_prefix = "/".join(dblp_key.split("/")[:-1]) - publtype = xml_elem.get('publtype') or None + publtype = xml_elem.get("publtype") or None dblp_type = xml_elem.name if dblp_type not in self.ELEMENT_TYPES: - self.counts[f'skip-dblp-type:{dblp_type}'] += 1 + self.counts[f"skip-dblp-type:{dblp_type}"] += 1 - if dblp_key_type in ('homepages', 'persons', 'dblpnote'): - self.counts['skip-key-type'] += 1 + if dblp_key_type in ("homepages", "persons", "dblpnote"): + self.counts["skip-key-type"] += 1 return False - if dblp_key.startswith('journals/corr/'): - self.counts['skip-arxiv-corr'] += 1 + if dblp_key.startswith("journals/corr/"): + self.counts["skip-arxiv-corr"] += 1 return False title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True) if not title: - self.counts['skip-title'] += 1 + self.counts["skip-title"] += 1 return False - if title.endswith('.'): + if title.endswith("."): title = title[:-1] release_type = None - release_stage = 'published' + release_stage = "published" withdrawn_status = None # primary releae_type detection: type of XML element, then prefix of key for granularity - if dblp_type == 'article': - release_type = 'article' - if dblp_key_type == 'journals' and publtype != 'informal': - release_type = 'article-journal' - elif dblp_key_type == 'tr': - release_type = 'report' + if dblp_type == "article": + release_type = "article" + if dblp_key_type == "journals" and publtype != "informal": + release_type = "article-journal" + elif dblp_key_type == "tr": + release_type = "report" elif title.startswith("Review:"): - release_type = 'review' - elif dblp_type == 'inproceedings': - release_type = 'paper-conference' - elif dblp_type == 'book': - release_type = 'book' - elif dblp_type == 'incollection': + release_type = "review" + elif dblp_type == "inproceedings": + release_type = "paper-conference" + elif dblp_type == "book": + release_type = "book" + elif dblp_type == "incollection": # XXX: part vs. chapter? - release_type = 'chapter' - elif dblp_type == 'data': - release_type = 'dataset' - elif dblp_type in ('mastersthesis', 'phdthesis'): - release_type = 'thesis' + release_type = "chapter" + elif dblp_type == "data": + release_type = "dataset" + elif dblp_type in ("mastersthesis", "phdthesis"): + release_type = "thesis" # overrides/extensions of the above - if publtype == 'informal': + if publtype == "informal": # for conferences, seems to indicate peer-review status # for journals, seems to indicate things like book reviews; split out above pass - elif publtype == 'encyclopedia': - release_type = 'entry-encyclopedia' - elif publtype == 'edited': + elif publtype == "encyclopedia": + release_type = "entry-encyclopedia" + elif publtype == "edited": # XXX: article? - release_type = 'editorial' - elif publtype == 'data': - release_type = 'dataset' - elif publtype == 'data': - release_type = 'dataset' - elif publtype == 'software': - release_type = 'software' - elif publtype == 'widthdrawn': - withdrawn_status = 'widthdrawn' - elif publtype == 'survey': + release_type = "editorial" + elif publtype == "data": + release_type = "dataset" + elif publtype == "data": + release_type = "dataset" + elif publtype == "software": + release_type = "software" + elif publtype == "widthdrawn": + withdrawn_status = "widthdrawn" + elif publtype == "survey": # XXX: flag as a review/survey article? 
pass - #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr) + # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr) container_name = None booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text) @@ -236,7 +232,9 @@ class DblpReleaseImporter(EntityImporter): part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_month = None release_year = None @@ -245,39 +243,39 @@ class DblpReleaseImporter(EntityImporter): if isbn: ext_ids.isbn13 = isbn if ext_ids.doi: - self.counts['has-doi'] += 1 + self.counts["has-doi"] += 1 # dblp-specific extra dblp_extra = dict(type=dblp_type) note = clean_str(xml_elem.note and xml_elem.note.text) - if note and 'base-search.net' not in note: - dblp_extra['note'] = note + if note and "base-search.net" not in note: + dblp_extra["note"] = note if part_of_key: - dblp_extra['part_of_key'] = part_of_key + dblp_extra["part_of_key"] = part_of_key # generic extra extra = dict() if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name - if series and (dblp_key_type == 'series' or dblp_type == 'book'): - extra['series-title'] = series + if series and (dblp_key_type == "series" or dblp_type == "book"): + extra["series-title"] = series elif series: - dblp_extra['series'] = series + dblp_extra["series"] = series - if booktitle and dblp_key_type == 'series': - extra['container-title'] = booktitle - elif booktitle and dblp_key_type == 'conf': - extra['event'] = booktitle + if booktitle and dblp_key_type == "series": + extra["container-title"] = booktitle + elif booktitle and dblp_key_type == "conf": + extra["event"] = booktitle elif booktitle: - dblp_extra['booktitle'] = booktitle + dblp_extra["booktitle"] = booktitle if release_year and release_month: # TODO: release_month schema migration - extra['release_month'] = release_month + extra["release_month"] = release_month if dblp_extra: - extra['dblp'] = dblp_extra + extra["dblp"] = dblp_extra if not extra: extra = None @@ -289,7 +287,7 @@ class DblpReleaseImporter(EntityImporter): withdrawn_status=withdrawn_status, title=title, release_year=release_year, - #release_date, + # release_date, publisher=publisher, ext_ids=ext_ids, contribs=contribs, @@ -302,8 +300,8 @@ class DblpReleaseImporter(EntityImporter): if self.dump_json_mode: re_dict = entity_to_dict(re, api_client=self.api.api_client) - re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem) - re_dict['_dblp_prefix'] = dblp_prefix + re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem) + re_dict["_dblp_prefix"] = dblp_prefix print(json.dumps(re_dict, sort_keys=True)) return False @@ -341,11 +339,11 @@ class DblpReleaseImporter(EntityImporter): # then try other ext_id lookups if not existing: - for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'): + for extid_type in ("doi", "wikidata_qid", "isbn13", "arxiv"): extid_val = getattr(re.ext_ids, extid_type) if not extid_val: continue - #print(f" lookup release type: {extid_type} val: {extid_val}") + # print(f" lookup release type: {extid_type} val: {extid_val}") try: existing = self.api.lookup_release(**{extid_type: extid_val}) except fatcat_openapi_client.rest.ApiException as err: @@ -373,12 +371,14 @@ class 
DblpReleaseImporter(EntityImporter): return True if not self.do_updates or existing.ext_ids.dblp: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # logic for whether to do update or skip - if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv: - self.counts['skip-update'] += 1 + if ( + existing.container_id and existing.release_type and existing.release_stage + ) or existing.ext_ids.arxiv: + self.counts["skip-update"] += 1 return False # fields to copy over for update @@ -390,20 +390,20 @@ class DblpReleaseImporter(EntityImporter): existing.release_stage = existing.release_stage or re.release_stage existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status existing.container_id = existing.container_id or re.container_id - existing.extra['dblp'] = re.extra['dblp'] + existing.extra["dblp"] = re.extra["dblp"] existing.volume = existing.volume or re.volume existing.issue = existing.issue or re.issue existing.pages = existing.pages or re.pages try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -411,11 +411,14 @@ class DblpReleaseImporter(EntityImporter): return False def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: """ @@ -428,14 +431,14 @@ class DblpReleaseImporter(EntityImporter): """ contribs = [] index = 0 - for elem in authors.find_all('author'): + for elem in authors.find_all("author"): contrib = self.dblp_contrib_single(elem) contrib.role = "author" contrib.index = index contribs.append(contrib) index += 1 - for elem in authors.find_all('editor'): + for elem in authors.find_all("editor"): contrib = self.dblp_contrib_single(elem) contrib.role = "editor" contribs.append(contrib) @@ -459,10 +462,10 @@ class DblpReleaseImporter(EntityImporter): # remove number in author name, if present if raw_name.split()[-1].isdigit(): - raw_name = ' '.join(raw_name.split()[:-1]) + raw_name = " ".join(raw_name.split()[:-1]) - if elem.get('orcid'): - orcid = clean_orcid(elem['orcid']) + if elem.get("orcid"): + orcid = clean_orcid(elem["orcid"]) if orcid: creator_id = self.lookup_orcid(orcid) if not creator_id: @@ -491,22 +494,26 @@ class DblpReleaseImporter(EntityImporter): wikidata_qid: Optional[str] = None arxiv_id: Optional[str] = None hdl: Optional[str] = None - for ee in xml_elem.find_all('ee'): + for ee in xml_elem.find_all("ee"): url = ee.text # convert DOI-like domains, which mostly have DOIs anyways - if '://doi.acm.org/' in url: - url = url.replace('://doi.acm.org/', '://doi.org/') - elif '://doi.ieeecomputersociety.org/' in url: - 
url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/') + if "://doi.acm.org/" in url: + url = url.replace("://doi.acm.org/", "://doi.org/") + elif "://doi.ieeecomputersociety.org/" in url: + url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/") - if 'doi.org/10.' in url and not doi: + if "doi.org/10." in url and not doi: doi = clean_doi(url) - elif 'wikidata.org/entity/Q' in url and not wikidata_qid: + elif "wikidata.org/entity/Q" in url and not wikidata_qid: wikidata_qid = clean_wikidata_qid(url) - elif '://arxiv.org/abs/' in url and not arxiv_id: - arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '') + elif "://arxiv.org/abs/" in url and not arxiv_id: + arxiv_id = ( + url.replace("http://", "") + .replace("https://", "") + .replace("arxiv.org/abs/", "") + ) arxiv_id = clean_arxiv_id(arxiv_id) - elif '://hdl.handle.net' in url and not hdl: + elif "://hdl.handle.net" in url and not hdl: hdl = clean_hdl(url) return fatcat_openapi_client.ReleaseExtIds( @@ -525,14 +532,14 @@ class DblpReleaseImporter(EntityImporter): sandcrawler ingest requests. """ EXTID_PATTERNS = [ - '://doi.acm.org/', - '://doi.ieeecomputersociety.org/', - 'doi.org/10.', - 'wikidata.org/entity/Q', - '://arxiv.org/abs/', + "://doi.acm.org/", + "://doi.ieeecomputersociety.org/", + "doi.org/10.", + "wikidata.org/entity/Q", + "://arxiv.org/abs/", ] urls = [] - for ee in xml_elem.find_all('ee'): + for ee in xml_elem.find_all("ee"): url = ee.text skip = False for pattern in EXTID_PATTERNS: diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 1831c4cd..cd063337 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -28,26 +28,23 @@ MAX_ABSTRACT_LENGTH = 2048 class DoajArticleImporter(EntityImporter): - - def __init__(self, - api, - issn_map_file, - **kwargs): + def __init__(self, api, issn_map_file, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps" + "editgroup_description", + "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps", ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DoajArticleImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DoajArticleImporter") # ensure default is to not do updates with this worker (override super() default) - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - issn_map_file=issn_map_file, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__( + api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs, + ) self.this_year = datetime.datetime.now().year self.read_issn_map_file(issn_map_file) @@ -82,21 +79,21 @@ class DoajArticleImporter(EntityImporter): } """ - if not obj or not isinstance(obj, dict) or 'bibjson' not in obj: - self.counts['skip-empty'] += 1 + if not obj or not isinstance(obj, dict) or "bibjson" not in obj: + self.counts["skip-empty"] += 1 return None - bibjson = obj['bibjson'] + bibjson = obj["bibjson"] - title = clean_str(bibjson.get('title'), force_xml=True) + title = clean_str(bibjson.get("title"), force_xml=True) if not title: - self.counts['skip-title'] += 1 + 
self.counts["skip-title"] += 1 return False - container_name = clean_str(bibjson['journal']['title']) + container_name = clean_str(bibjson["journal"]["title"]) container_id = None # NOTE: 'issns' not documented in API schema - for issn in bibjson['journal']['issns']: + for issn in bibjson["journal"]["issns"]: issnl = self.issn2issnl(issn) if issnl: container_id = self.lookup_issnl(self.issn2issnl(issn)) @@ -105,75 +102,83 @@ class DoajArticleImporter(EntityImporter): container_name = None break - volume = clean_str(bibjson['journal'].get('volume')) + volume = clean_str(bibjson["journal"].get("volume")) # NOTE: this schema seems to use "number" as "issue number" - issue = clean_str(bibjson['journal'].get('number')) - publisher = clean_str(bibjson['journal'].get('publisher')) + issue = clean_str(bibjson["journal"].get("number")) + publisher = clean_str(bibjson["journal"].get("publisher")) try: - release_year = int(bibjson.get('year')) + release_year = int(bibjson.get("year")) except (TypeError, ValueError): release_year = None - release_month = parse_month(clean_str(bibjson.get('month'))) + release_month = parse_month(clean_str(bibjson.get("month"))) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_month = None release_year = None - license_slug = self.doaj_license_slug(bibjson['journal'].get('license')) - country = parse_country_name(bibjson['journal'].get('country')) + license_slug = self.doaj_license_slug(bibjson["journal"].get("license")) + country = parse_country_name(bibjson["journal"].get("country")) language = None - for raw in bibjson['journal'].get('language') or []: + for raw in bibjson["journal"].get("language") or []: language = parse_lang_name(raw) if language: break # pages # NOTE: error in API docs? 
seems like start_page not under 'journal' object - start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page')) - end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page')) + start_page = clean_str(bibjson["journal"].get("start_page")) or clean_str( + bibjson.get("start_page") + ) + end_page = clean_str(bibjson["journal"].get("end_page")) or clean_str( + bibjson.get("end_page") + ) pages: Optional[str] = None if start_page and end_page: pages = f"{start_page}-{end_page}" elif start_page: pages = start_page - doaj_article_id = obj['id'].lower() - ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id) + doaj_article_id = obj["id"].lower() + ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id) abstracts = self.doaj_abstracts(bibjson) - contribs = self.doaj_contribs(bibjson.get('author') or []) + contribs = self.doaj_contribs(bibjson.get("author") or []) # DOAJ-specific extra doaj_extra = dict() - if bibjson.get('subject'): - doaj_extra['subject'] = bibjson.get('subject') - if bibjson.get('keywords'): - doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k] + if bibjson.get("subject"): + doaj_extra["subject"] = bibjson.get("subject") + if bibjson.get("keywords"): + doaj_extra["keywords"] = [ + k for k in [clean_str(s) for s in bibjson.get("keywords")] if k + ] # generic extra extra = dict() if country: - extra['country'] = country + extra["country"] = country if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name if release_year and release_month: # TODO: schema migration - extra['release_month'] = release_month + extra["release_month"] = release_month if doaj_extra: - extra['doaj'] = doaj_extra + extra["doaj"] = doaj_extra if not extra: extra = None re = fatcat_openapi_client.ReleaseEntity( work_id=None, container_id=container_id, - release_type='article-journal', - release_stage='published', + release_type="article-journal", + release_stage="published", title=title, release_year=release_year, - #release_date, + # release_date, publisher=publisher, ext_ids=ext_ids, contribs=contribs, @@ -208,11 +213,11 @@ class DoajArticleImporter(EntityImporter): # then try other ext_id lookups if not existing: - for extid_type in ('doi', 'pmid', 'pmcid'): + for extid_type in ("doi", "pmid", "pmcid"): extid_val = getattr(re.ext_ids, extid_type) if not extid_val: continue - #print(f" lookup release type: {extid_type} val: {extid_val}") + # print(f" lookup release type: {extid_type} val: {extid_val}") try: existing = self.api.lookup_release(**{extid_type: extid_val}) except fatcat_openapi_client.rest.ApiException as err: @@ -241,7 +246,7 @@ class DoajArticleImporter(EntityImporter): # other logic could go here about skipping updates if not self.do_updates or existing.ext_ids.doaj: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # fields to copy over for update @@ -250,7 +255,7 @@ class DoajArticleImporter(EntityImporter): existing.release_stage = existing.release_stage or re.release_stage existing.container_id = existing.container_id or re.container_id existing.abstracts = existing.abstracts or re.abstracts - existing.extra['doaj'] = re.extra['doaj'] + existing.extra["doaj"] = re.extra["doaj"] existing.volume = existing.volume or re.volume existing.issue = existing.issue or re.issue existing.pages = existing.pages or re.pages @@ -258,13 +263,13 @@ class 
DoajArticleImporter(EntityImporter): try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -272,14 +277,17 @@ class DoajArticleImporter(EntityImporter): return False def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]: - text = clean_str(bibjson.get('abstract')) + text = clean_str(bibjson.get("abstract")) if not text or len(text) < 10: return [] if len(text) > MAX_ABSTRACT_LENGTH: @@ -293,7 +301,9 @@ class DoajArticleImporter(EntityImporter): lang=lang, ) - return [abstract,] + return [ + abstract, + ] def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: """ @@ -306,23 +316,27 @@ class DoajArticleImporter(EntityImporter): contribs = [] index = 0 for author in authors: - if not author.get('name'): + if not author.get("name"): continue creator_id = None - orcid = clean_orcid(author.get('orcid_id')) + orcid = clean_orcid(author.get("orcid_id")) if orcid: creator_id = self.lookup_orcid(orcid) - contribs.append(fatcat_openapi_client.ReleaseContrib( - raw_name=author.get('name'), - role='author', - index=index, - creator_id=creator_id, - raw_affiliation=clean_str(author.get('affiliation')), - )) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + raw_name=author.get("name"), + role="author", + index=index, + creator_id=creator_id, + raw_affiliation=clean_str(author.get("affiliation")), + ) + ) index += 1 return contribs - def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds: + def doaj_ext_ids( + self, identifiers: List[dict], doaj_article_id: str + ) -> fatcat_openapi_client.ReleaseExtIds: """ bibjson.identifier { id (string), @@ -336,14 +350,14 @@ class DoajArticleImporter(EntityImporter): pmid: Optional[str] = None pmcid: Optional[str] = None for id_obj in identifiers: - if not id_obj.get('id'): + if not id_obj.get("id"): continue - if id_obj['type'].lower() == 'doi': - doi = clean_doi(id_obj['id']) - elif id_obj['type'].lower() == 'pmid': - pmid = clean_pmid(id_obj['id']) - elif id_obj['type'].lower() == 'pmcid': - pmcid = clean_pmcid(id_obj['id']) + if id_obj["type"].lower() == "doi": + doi = clean_doi(id_obj["id"]) + elif id_obj["type"].lower() == "pmid": + pmid = clean_pmid(id_obj["id"]) + elif id_obj["type"].lower() == "pmcid": + pmcid = clean_pmcid(id_obj["id"]) return fatcat_openapi_client.ReleaseExtIds( doaj=doaj_article_id, @@ -365,10 +379,10 @@ class DoajArticleImporter(EntityImporter): if not license_list: return None for license in license_list: - if not license.get('open_access'): + if not 
license.get("open_access"): continue - slug = license.get('type') - if slug.startswith('CC '): - slug = slug.replace('CC ', 'cc-').lower() + slug = license.get("type") + if slug.startswith("CC "): + slug = slug.replace("CC ", "cc-").lower() return slug return None diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 0951ed84..26584ff3 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from .common import EntityImporter @@ -17,19 +16,16 @@ class FileMetaImporter(EntityImporter): def __init__(self, api, require_grobid=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileMetaImporter') - kwargs['do_updates'] = kwargs.get("do_updates", True) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates" + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileMetaImporter") + kwargs["do_updates"] = kwargs.get("do_updates", True) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - for k in ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype'): + for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"): if not row.get(k): - self.counts['skip-missing-field'] += 1 + self.counts["skip-missing-field"] += 1 return False return True @@ -40,11 +36,11 @@ class FileMetaImporter(EntityImporter): file_meta = row fe = fatcat_openapi_client.FileEntity( - md5=file_meta['md5hex'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - mimetype=file_meta['mimetype'], + md5=file_meta["md5hex"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + mimetype=file_meta["mimetype"], ) return fe @@ -59,11 +55,11 @@ class FileMetaImporter(EntityImporter): raise err if not existing: - self.counts['skip-no-match'] += 1 + self.counts["skip-no-match"] += 1 return False - if (existing.md5 and existing.sha256 and existing.size and existing.mimetype): - self.counts['skip-existing-complete'] += 1 + if existing.md5 and existing.sha256 and existing.size and existing.mimetype: + self.counts["skip-existing-complete"] += 1 return False existing.md5 = existing.md5 or fe.md5 @@ -75,5 +71,5 @@ class FileMetaImporter(EntityImporter): existing = self.generic_file_cleanups(existing) self.api.update_file(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py index 43c2a49c..dd8f5600 100644 --- a/python/fatcat_tools/importers/fileset_generic.py +++ b/python/fatcat_tools/importers/fileset_generic.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from fatcat_tools import entity_from_dict @@ -20,34 +19,31 @@ class FilesetImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Generic Fileset entity import" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FilesetImporter') - kwargs['do_updates'] = 
bool(kwargs.get("do_updates", False)) + eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import" + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FilesetImporter") + kwargs["do_updates"] = bool(kwargs.get("do_updates", False)) self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False)) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) # bezerk mode doesn't make sense for this importer assert self.bezerk_mode is False def want(self, row): - if not row.get('release_ids'): - self.counts['skip-no-release-ids'] += 1 + if not row.get("release_ids"): + self.counts["skip-no-release-ids"] += 1 return False - if not row.get('urls'): - self.counts['skip-no-urls'] += 1 + if not row.get("urls"): + self.counts["skip-no-urls"] += 1 return False - if not row.get('manifest'): - self.counts['skip-no-files'] += 1 + if not row.get("manifest"): + self.counts["skip-no-files"] += 1 return False - for f in row.get('manifest'): - for k in ('sha1', 'md5'): + for f in row.get("manifest"): + for k in ("sha1", "md5"): if not f.get(k): - self.counts['skip-missing-file-field'] += 1 + self.counts["skip-missing-file-field"] += 1 return False return True @@ -66,19 +62,24 @@ class FilesetImporter(EntityImporter): if not self.skip_release_fileset_check: for release_id in fse.release_ids: # don't catch 404, that would be an error - release = self.api.get_release(release_id, expand='filesets', hide='abstracts,refs') - assert release.state == 'active' + release = self.api.get_release( + release_id, expand="filesets", hide="abstracts,refs" + ) + assert release.state == "active" if release.filesets: - self.counts['exists'] += 1 - self.counts['exists-via-release-filesets'] += 1 + self.counts["exists"] += 1 + self.counts["exists-via-release-filesets"] += 1 return False # do the insert return True def insert_batch(self, batch): - self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_fileset_auto_batch( + fatcat_openapi_client.FilesetAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 0f666652..f7bb5357 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,7 +7,7 @@ import fatcat_openapi_client from .common import EntityImporter, clean, make_rel_url -MAX_ABSTRACT_BYTES=4096 +MAX_ABSTRACT_BYTES = 4096 class GrobidMetadataImporter(EntityImporter): @@ -24,14 +24,13 @@ class GrobidMetadataImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Import of release and file metadata, as extracted from PDFs by GROBID.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Import of release and file metadata, as extracted from PDFs by GROBID.", 
+ ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.GrobidMetadataImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") self.longtail_oa = kwargs.get("longtail_oa", False) @@ -40,7 +39,7 @@ class GrobidMetadataImporter(EntityImporter): def parse_record(self, row): - fields = row.split('\t') + fields = row.split("\t") sha1_key = fields[0] cdx = json.loads(fields[1]) mimetype = fields[2] @@ -65,8 +64,8 @@ class GrobidMetadataImporter(EntityImporter): # TODO: this is where we should check if the file actually has # release_ids and/or URLs associated with it if existing and not self.bezerk_mode: - self.counts['exists'] += 1 - self.counts['skip'] -= 1 + self.counts["exists"] += 1 + self.counts["skip"] -= 1 return None release_edit = self.create_release(re) @@ -75,75 +74,81 @@ class GrobidMetadataImporter(EntityImporter): def parse_grobid_json(self, obj): - if not obj.get('title'): + if not obj.get("title"): return None extra_grobid = dict() - abstract = obj.get('abstract') + abstract = obj.get("abstract") if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: abobj = fatcat_openapi_client.ReleaseAbstract( - mimetype="text/plain", - content=clean(obj.get('abstract'))) + mimetype="text/plain", content=clean(obj.get("abstract")) + ) abstracts = [abobj] else: abstracts = None contribs = [] - for i, a in enumerate(obj.get('authors', [])): - contribs.append(fatcat_openapi_client.ReleaseContrib( - index=i, - raw_name=clean(a['name']), - given_name=clean(a.get('given_name')), - surname=clean(a.get('surname')), - role="author", - extra=None)) + for i, a in enumerate(obj.get("authors", [])): + contribs.append( + fatcat_openapi_client.ReleaseContrib( + index=i, + raw_name=clean(a["name"]), + given_name=clean(a.get("given_name")), + surname=clean(a.get("surname")), + role="author", + extra=None, + ) + ) refs = [] - for raw in obj.get('citations', []): + for raw in obj.get("citations", []): cite_extra = dict() year = None - if raw.get('date'): + if raw.get("date"): try: - year = int(raw['date'].strip()[:4]) + year = int(raw["date"].strip()[:4]) except (IndexError, ValueError): pass - for key in ('volume', 'url', 'issue', 'publisher'): + for key in ("volume", "url", "issue", "publisher"): if raw.get(key): cite_extra[key] = clean(raw[key]) - if raw.get('authors'): - cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] + if raw.get("authors"): + cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]] if not cite_extra: cite_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - key=clean(raw.get('id')), - year=year, - title=clean(raw['title']), - extra=cite_extra)) + refs.append( + fatcat_openapi_client.ReleaseRef( + key=clean(raw.get("id")), + year=year, + title=clean(raw["title"]), + extra=cite_extra, + ) + ) release_date = None release_year = None - if obj.get('date'): + if obj.get("date"): # only returns year, ever? - release_year = int(obj['date'][:4]) + release_year = int(obj["date"][:4]) extra = dict() - if obj.get('doi'): - extra['doi'] = obj['doi'] - if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = clean(obj['journal']['name']) + if obj.get("doi"): + extra["doi"] = obj["doi"] + if obj["journal"] and obj["journal"].get("name"): + extra["container_name"] = clean(obj["journal"]["name"]) # TODO: ISSN/eISSN handling? or just journal name lookup? 
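(Aside, not part of the diff: the file-identity handling in parse_file_metadata() further below, converting the "sha1:<base32>" key to lowercase hex and building a wayback URL from CDX fields, looks like this in isolation; the digest and CDX values are made-up examples.)

import base64

sha1_key = "sha1:TGCCY7TGRDSQHGW6WORGXFY3PLR47BBP"  # hypothetical example value
sha1_hex = (
    base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
    .decode("ascii")
    .lower()
)
assert len(sha1_hex) == 40  # 20-byte SHA-1 digest as lowercase hex

cdx = {"url": "https://example.com/paper.pdf", "dt": "20200101000000"}  # hypothetical CDX fields
wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], cdx["url"])
assert wayback == "https://web.archive.org/web/20200101000000/https://example.com/paper.pdf"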
if extra_grobid: - extra['grobid'] = extra_grobid + extra["grobid"] = extra_grobid if self.longtail_oa: - extra['longtail_oa'] = True + extra["longtail_oa"] = True if not extra: extra = None - title = clean(obj['title'], force_xml=True) + title = clean(obj["title"], force_xml=True) if not title or len(title) < 2: return None @@ -154,17 +159,22 @@ class GrobidMetadataImporter(EntityImporter): release_year=release_year, contribs=contribs, refs=refs, - publisher=clean(obj['journal'].get('publisher')), - volume=clean(obj['journal'].get('volume')), - issue=clean(obj['journal'].get('issue')), + publisher=clean(obj["journal"].get("publisher")), + volume=clean(obj["journal"].get("volume")), + issue=clean(obj["journal"].get("issue")), abstracts=abstracts, ext_ids=fatcat_openapi_client.ReleaseExtIds(), - extra=extra) + extra=extra, + ) return re def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): - sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() + sha1 = ( + base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", ""))) + .decode("ascii") + .lower() + ) fe = fatcat_openapi_client.FileEntity( sha1=sha1, @@ -175,16 +185,15 @@ class GrobidMetadataImporter(EntityImporter): ) # parse URLs and CDX - original = cdx['url'] - assert len(cdx['dt']) >= 8 - wayback = "https://web.archive.org/web/{}/{}".format( - cdx['dt'], - original) - fe.urls.append( - fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive")) + original = cdx["url"] + assert len(cdx["dt"]) >= 8 + wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original) + fe.urls.append(fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive")) original_url = make_rel_url(original, default_link_rel=self.default_link_rel) if original_url is not None: - fe.urls.append(fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1])) + fe.urls.append( + fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1]) + ) return fe @@ -193,8 +202,11 @@ class GrobidMetadataImporter(EntityImporter): return True def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index f0943c1e..e0a6c3f5 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,4 +1,3 @@ - import datetime import fatcat_openapi_client @@ -7,17 +6,16 @@ from .common import EntityImporter, make_rel_url class IngestFileResultImporter(EntityImporter): - def __init__(self, api, require_grobid=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Files crawled from web using sandcrawler ingest tool" + ) + eg_extra = 
kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFileResultImporter") + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.use_glutton_match = False self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel @@ -27,20 +25,20 @@ class IngestFileResultImporter(EntityImporter): else: print("NOT checking GROBID success") self.ingest_request_source_allowlist = [ - 'fatcat-changelog', - 'fatcat-ingest-container', - 'fatcat-ingest', - 'arabesque', + "fatcat-changelog", + "fatcat-ingest-container", + "fatcat-ingest", + "arabesque", #'mag-corpus', #'mag', - 'unpaywall-corpus', - 'unpaywall', + "unpaywall-corpus", + "unpaywall", #'s2-corpus', #'s2', - 'doaj', - 'dblp', + "doaj", + "dblp", ] - if kwargs.get('skip_source_allowlist', False): + if kwargs.get("skip_source_allowlist", False): self.ingest_request_source_allowlist = [] def want_file(self, row) -> bool: @@ -48,28 +46,32 @@ class IngestFileResultImporter(EntityImporter): File-specific part of want(). Generic across general ingest and save-paper-now. """ - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False # type-specific filters - if row['request'].get('ingest_type') == 'pdf': - if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: - self.counts['skip-grobid'] += 1 + if row["request"].get("ingest_type") == "pdf": + if self.require_grobid and row.get("grobid", {}).get("status_code") != 200: + self.counts["skip-grobid"] += 1 return False - if row['file_meta'].get('mimetype') not in ("application/pdf",): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("application/pdf",): + self.counts["skip-mimetype"] += 1 return False - elif row['request'].get('ingest_type') == 'xml': - if row['file_meta'].get('mimetype') not in ("application/xml", - "application/jats+xml", "application/tei+xml", "text/xml"): - self.counts['skip-mimetype'] += 1 + elif row["request"].get("ingest_type") == "xml": + if row["file_meta"].get("mimetype") not in ( + "application/xml", + "application/jats+xml", + "application/tei+xml", + "text/xml", + ): + self.counts["skip-mimetype"] += 1 return False - elif row['request'].get('ingest_type') in ['component', 'src', 'dataset-file']: + elif row["request"].get("ingest_type") in ["component", "src", "dataset-file"]: # we rely on sandcrawler for these checks pass else: - self.counts['skip-ingest-type'] += 1 + self.counts["skip-ingest-type"] += 1 return False return True @@ -79,24 +81,36 @@ class IngestFileResultImporter(EntityImporter): Sandcrawler ingest-specific part of want(). Generic across file and webcapture ingest. 
""" - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist: - self.counts['skip-ingest_request_source'] += 1 + if ( + self.ingest_request_source_allowlist + and source not in self.ingest_request_source_allowlist + ): + self.counts["skip-ingest_request_source"] += 1 return False - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'): - self.counts['skip-link-source'] += 1 + if row["request"].get("link_source") not in ( + "arxiv", + "pmc", + "unpaywall", + "doi", + "mag", + "s2", + "doaj", + "dblp", + ): + self.counts["skip-link-source"] += 1 return False - if source.startswith('savepapernow'): + if source.startswith("savepapernow"): # never process async savepapernow requests - self.counts['skip-savepapernow'] += 1 + self.counts["skip-savepapernow"] += 1 return False return True @@ -125,19 +139,19 @@ class IngestFileResultImporter(EntityImporter): def parse_ingest_release_ident(self, row): - request = row['request'] - fatcat = request.get('fatcat') + request = row["request"] + fatcat = request.get("fatcat") release_ident = None - if fatcat and fatcat.get('release_ident'): - release_ident = fatcat.get('release_ident') - elif request.get('ext_ids'): + if fatcat and fatcat.get("release_ident"): + release_ident = fatcat.get("release_ident") + elif request.get("ext_ids"): # if no fatcat ident, try extids - for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv', 'doaj', 'dblp'): - extid = request['ext_ids'].get(extid_type) + for extid_type in ("doi", "pmid", "pmcid", "arxiv", "doaj", "dblp"): + extid = request["ext_ids"].get(extid_type) if not extid: continue - if extid_type == 'doi': + if extid_type == "doi": extid = extid.lower() try: release = self.api.lookup_release(**{extid_type: extid}) @@ -145,66 +159,69 @@ class IngestFileResultImporter(EntityImporter): if err.status == 404: continue elif err.status == 400: - self.counts['warn-extid-invalid'] += 1 + self.counts["warn-extid-invalid"] += 1 continue raise err # verify release_stage - if request.get('release_stage') and release.release_stage: - if request['release_stage'] != release.release_stage: - self.counts['skip-release-stage'] += 1 + if request.get("release_stage") and release.release_stage: + if request["release_stage"] != release.release_stage: + self.counts["skip-release-stage"] += 1 return None release_ident = release.ident break - if self.use_glutton_match and not release_ident and row.get('grobid'): + if self.use_glutton_match and not release_ident and row.get("grobid"): # try biblio-glutton extracted hit - if row['grobid'].get('fatcat_release'): - release_ident = row['grobid']['fatcat_release'].split('_')[-1] - self.counts['glutton-match'] += 1 + if row["grobid"].get("fatcat_release"): + release_ident = row["grobid"]["fatcat_release"].split("_")[-1] + self.counts["glutton-match"] += 1 return release_ident def parse_terminal(self, row): - terminal = row.get('terminal') + terminal = row.get("terminal") if not terminal: # support old cdx-only ingest results - cdx = row.get('cdx') + cdx = row.get("cdx") if not cdx: return None else: terminal = { - 'terminal_url': 
cdx['url'], - 'terminal_dt': cdx['datetime'], - 'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'), + "terminal_url": cdx["url"], + "terminal_dt": cdx["datetime"], + "terminal_status_code": cdx.get("status_code") or cdx.get("http_status"), } # work around old schema - if 'terminal_url' not in terminal: - terminal['terminal_url'] = terminal['url'] - if 'terminal_dt' not in terminal: - terminal['terminal_dt'] = terminal['dt'] + if "terminal_url" not in terminal: + terminal["terminal_url"] = terminal["url"] + if "terminal_dt" not in terminal: + terminal["terminal_dt"] = terminal["dt"] # convert CDX-style digits to ISO-style timestamp - assert len(terminal['terminal_dt']) == 14 - terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z" + assert len(terminal["terminal_dt"]) == 14 + terminal["terminal_timestamp"] = ( + datetime.datetime.strptime(terminal["terminal_dt"], "%Y%m%d%H%M%S").isoformat() + + "Z" + ) return terminal def parse_urls(self, row, terminal): - request = row['request'] + request = row["request"] default_rel = self.default_link_rel - if request.get('link_source') == 'doi': - default_rel = 'publisher' - default_rel = request.get('rel', default_rel) - url = make_rel_url(terminal['terminal_url'], default_rel) + if request.get("link_source") == "doi": + default_rel = "publisher" + default_rel = request.get("rel", default_rel) + url = make_rel_url(terminal["terminal_url"], default_rel) if not url: - self.counts['skip-url'] += 1 + self.counts["skip-url"] += 1 return None wayback = "https://web.archive.org/web/{}/{}".format( - terminal['terminal_dt'], - terminal['terminal_url']) + terminal["terminal_dt"], terminal["terminal_url"] + ) urls = [url, ("webarchive", wayback)] urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] @@ -212,38 +229,38 @@ class IngestFileResultImporter(EntityImporter): def parse_edit_extra(self, row): - request = row['request'] + request = row["request"] edit_extra = dict() - if request.get('edit_extra'): - edit_extra = request['edit_extra'] + if request.get("edit_extra"): + edit_extra = request["edit_extra"] - if request.get('ingest_request_source'): - edit_extra['ingest_request_source'] = request['ingest_request_source'] - if request.get('link_source') and request.get('link_source_id'): - edit_extra['link_source'] = request['link_source'] - edit_extra['link_source_id'] = request['link_source_id'] - if edit_extra['link_source'] == 'doi': - edit_extra['link_source_id'] = edit_extra['link_source_id'].lower() + if request.get("ingest_request_source"): + edit_extra["ingest_request_source"] = request["ingest_request_source"] + if request.get("link_source") and request.get("link_source_id"): + edit_extra["link_source"] = request["link_source"] + edit_extra["link_source_id"] = request["link_source_id"] + if edit_extra["link_source"] == "doi": + edit_extra["link_source_id"] = edit_extra["link_source_id"].lower() # GROBID metadata, for SPN requests (when there might not be 'success') - if request.get('ingest_type') == 'pdf': - if row.get('grobid') and row['grobid'].get('status') != 'success': - edit_extra['grobid_status_code'] = row['grobid']['status_code'] - edit_extra['grobid_version'] = row['grobid'].get('grobid_version') + if request.get("ingest_type") == "pdf": + if row.get("grobid") and row["grobid"].get("status") != "success": + edit_extra["grobid_status_code"] = row["grobid"]["status_code"] + edit_extra["grobid_version"] = 
row["grobid"].get("grobid_version") return edit_extra def parse_record(self, row): - request = row['request'] - file_meta = row['file_meta'] + request = row["request"] + file_meta = row["file_meta"] # double check that want() filtered request correctly (eg, old requests) - if request.get('ingest_type') not in ('pdf', 'xml'): - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") not in ("pdf", "xml"): + self.counts["skip-ingest-type"] += 1 return None - assert (request['ingest_type'], file_meta['mimetype']) in [ + assert (request["ingest_type"], file_meta["mimetype"]) in [ ("pdf", "application/pdf"), ("xml", "application/xml"), ("xml", "application/jats+xml"), @@ -255,23 +272,23 @@ class IngestFileResultImporter(EntityImporter): release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None terminal = self.parse_terminal(row) if not terminal: # TODO: support archive.org hits? - self.counts['skip-no-terminal'] += 1 + self.counts["skip-no-terminal"] += 1 return None urls = self.parse_urls(row, terminal) fe = fatcat_openapi_client.FileEntity( - md5=file_meta['md5hex'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - mimetype=file_meta['mimetype'], + md5=file_meta["md5hex"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + mimetype=file_meta["mimetype"], release_ids=[release_ident], urls=urls, ) @@ -293,7 +310,7 @@ class IngestFileResultImporter(EntityImporter): # check for existing edits-in-progress with same file hash for other in self._entity_queue: if other.sha1 == fe.sha1: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False if not existing: @@ -302,31 +319,36 @@ class IngestFileResultImporter(EntityImporter): # NOTE: the following checks all assume there is an existing item if (fe.release_ids[0] in existing.release_ids) and existing.urls: # TODO: could still, in theory update with the new URL? 
- self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not self.do_updates: - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False # TODO: for now, never update - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_file(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) class SavePaperNowFileImporter(IngestFileResultImporter): @@ -338,29 +360,29 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFileImporter') - kwargs['submit_mode'] = submit_mode - kwargs['require_grobid'] = False - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Files crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFileImporter") + kwargs["submit_mode"] = submit_mode + kwargs["require_grobid"] = False + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False if not self.want_file(row): @@ -377,14 +399,14 @@ class IngestWebResultImporter(IngestFileResultImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter') - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Webcaptures crawled from web using sandcrawler ingest tool" + ) + 
eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestWebResultImporter") + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): @@ -392,91 +414,95 @@ class IngestWebResultImporter(IngestFileResultImporter): return False # webcapture-specific filters - if row['request'].get('ingest_type') != 'html': - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False - if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return False return True def parse_record(self, row): - request = row['request'] - file_meta = row['file_meta'] + request = row["request"] + file_meta = row["file_meta"] # double check that want() filtered request correctly (eg, old requests) - if request.get('ingest_type') != "html": - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return None - if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return None # identify release by fatcat ident, or extid lookup release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None terminal = self.parse_terminal(row) if not terminal: # TODO: support archive.org hits? 
- self.counts['skip-no-terminal'] += 1 + self.counts["skip-no-terminal"] += 1 return None urls = self.parse_urls(row, terminal) - archive_urls = [u for u in urls if u.rel == 'webarchive'] + archive_urls = [u for u in urls if u.rel == "webarchive"] - if terminal['terminal_status_code'] != 200: - self.counts['skip-terminal-status-code'] += 1 + if terminal["terminal_status_code"] != 200: + self.counts["skip-terminal-status-code"] += 1 return None - terminal_cdx = row['cdx'] - if 'revisit_cdx' in row: - terminal_cdx = row['revisit_cdx'] - assert terminal_cdx['surt'] - if terminal_cdx['url'] != terminal['terminal_url']: - self.counts['skip-terminal-url-mismatch'] += 1 + terminal_cdx = row["cdx"] + if "revisit_cdx" in row: + terminal_cdx = row["revisit_cdx"] + assert terminal_cdx["surt"] + if terminal_cdx["url"] != terminal["terminal_url"]: + self.counts["skip-terminal-url-mismatch"] += 1 return None wc_cdx = [] # primary resource first - wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - surt=terminal_cdx['surt'], - timestamp=terminal['terminal_timestamp'], - url=terminal['terminal_url'], - mimetype=file_meta['mimetype'], - status_code=terminal['terminal_status_code'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - )) - - for resource in row.get('html_resources', []): - timestamp = resource['timestamp'] + wc_cdx.append( + fatcat_openapi_client.WebcaptureCdxLine( + surt=terminal_cdx["surt"], + timestamp=terminal["terminal_timestamp"], + url=terminal["terminal_url"], + mimetype=file_meta["mimetype"], + status_code=terminal["terminal_status_code"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + ) + ) + + for resource in row.get("html_resources", []): + timestamp = resource["timestamp"] if "+" not in timestamp and "Z" not in timestamp: timestamp += "Z" - wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - surt=resource['surt'], - timestamp=timestamp, - url=resource['url'], - mimetype=resource.get('mimetype'), - size=resource.get('size'), - sha1=resource.get('sha1hex'), - sha256=resource.get('sha256hex'), - )) + wc_cdx.append( + fatcat_openapi_client.WebcaptureCdxLine( + surt=resource["surt"], + timestamp=timestamp, + url=resource["url"], + mimetype=resource.get("mimetype"), + size=resource.get("size"), + sha1=resource.get("sha1hex"), + sha256=resource.get("sha256hex"), + ) + ) wc = fatcat_openapi_client.WebcaptureEntity( cdx=wc_cdx, archive_urls=archive_urls, - original_url=terminal['terminal_url'], - timestamp=terminal['terminal_timestamp'], + original_url=terminal["terminal_url"], + timestamp=terminal["terminal_timestamp"], release_ids=[release_ident], ) @@ -491,11 +517,11 @@ class IngestWebResultImporter(IngestFileResultImporter): # check for existing edits-in-progress with same URL for other in self._entity_queue: if other.original_url == wc.original_url: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False # lookup sha1, or create new entity (TODO: API doesn't support this yet) - #existing = None + # existing = None # TODO: currently only allow one release per webcapture release = self.api.get_release(wc.release_ids[0], expand="webcaptures") @@ -504,9 +530,9 @@ class IngestWebResultImporter(IngestFileResultImporter): for other in release.webcaptures: if wc.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False - 
self.counts['skip-release-has-webcapture'] += 1 + self.counts["skip-release-has-webcapture"] += 1 return False # Ok, if we got here then no existing web capture for (first) release, @@ -515,18 +541,24 @@ class IngestWebResultImporter(IngestFileResultImporter): def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_webcapture(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_webcapture_auto_batch( + fatcat_openapi_client.WebcaptureAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) + class SavePaperNowWebImporter(IngestWebResultImporter): """ @@ -535,15 +567,15 @@ class SavePaperNowWebImporter(IngestWebResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowWebImporter') - kwargs['submit_mode'] = submit_mode - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Webcaptures crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowWebImporter") + kwargs["submit_mode"] = submit_mode + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): """ @@ -553,27 +585,27 @@ class SavePaperNowWebImporter(IngestWebResultImporter): path, which means allowing hit=false. 
""" - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False # webcapture-specific filters - if row['request'].get('ingest_type') != 'html': - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False - if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return False - if row.get('status') not in ['success', 'unknown-scope']: - self.counts['skip-hit'] += 1 + if row.get("status") not in ["success", "unknown-scope"]: + self.counts["skip-hit"] += 1 return False return True @@ -587,28 +619,28 @@ class IngestFilesetResultImporter(IngestFileResultImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Filesets crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFilesetResultImporter') - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Filesets crawled from web using sandcrawler ingest tool" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFilesetResultImporter") + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.max_file_count = 300 def want_fileset(self, row): - if not row.get('manifest') or len(row.get('manifest')) == 0: - self.counts['skip-empty-manifest'] += 1 + if not row.get("manifest") or len(row.get("manifest")) == 0: + self.counts["skip-empty-manifest"] += 1 return False - if len(row.get('manifest')) == 1: - self.counts['skip-single-file'] += 1 + if len(row.get("manifest")) == 1: + self.counts["skip-single-file"] += 1 return False - if len(row.get('manifest')) > self.max_file_count: - self.counts['skip-too-many-files'] += 1 + if len(row.get("manifest")) > self.max_file_count: + self.counts["skip-too-many-files"] += 1 return False return True @@ -619,8 +651,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return False # fileset-specific filters - if row['request'].get('ingest_type') not in ['dataset',]: - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") not in [ + "dataset", + ]: + self.counts["skip-ingest-type"] += 1 return False if not self.want_fileset(row): @@ -629,102 +663,118 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True def parse_fileset_urls(self, row): - if not row.get('strategy'): + if not row.get("strategy"): return [] - strategy = row['strategy'] + strategy = row["strategy"] urls = [] - if strategy == 'archiveorg-fileset' and row.get('archiveorg_item_name'): - 
urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}/", - rel="archive-base", - )) - if row['strategy'].startswith('web-') and row.get('platform_base_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", - rel="webarchive-base", - )) + if strategy == "archiveorg-fileset" and row.get("archiveorg_item_name"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/", + rel="archive-base", + ) + ) + if row["strategy"].startswith("web-") and row.get("platform_base_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", + rel="webarchive-base", + ) + ) # TODO: repository-base # TODO: web-base - if row['strategy'] == 'archiveorg-fileset-bundle' and row.get('archiveorg_item_name'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", - rel="archive-bundle", - )) + if row["strategy"] == "archiveorg-fileset-bundle" and row.get("archiveorg_item_name"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", + rel="archive-bundle", + ) + ) - if row['strategy'] == 'web-fileset-bundle' and row.get('platform_bundle_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", - rel="webarchive-bundle", - )) + if row["strategy"] == "web-fileset-bundle" and row.get("platform_bundle_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", + rel="webarchive-bundle", + ) + ) # add any additional / platform URLs here - if row.get('platform_bundle_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=row['platform_bundle_url'], - rel="repository-bundle", - )) - if row.get('platform_base_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=row['platform_bundle_url'], - rel="repository-base", - )) + if row.get("platform_bundle_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=row["platform_bundle_url"], + rel="repository-bundle", + ) + ) + if row.get("platform_base_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=row["platform_bundle_url"], + rel="repository-base", + ) + ) return urls def parse_record(self, row): - request = row['request'] + request = row["request"] # double check that want() filtered request correctly - if request.get('ingest_type') not in ["dataset",]: - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") not in [ + "dataset", + ]: + self.counts["skip-ingest-type"] += 1 return None # identify release by fatcat ident, or extid lookup release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None entity_extra = dict() edit_extra = self.parse_edit_extra(row) - edit_extra['ingest_strategy'] = row['ingest_strategy'] - if row.get('platform'): - edit_extra['platform'] = row['platform'] - if row.get('platform_id'): - edit_extra['platform_id'] = row['platform_id'] + edit_extra["ingest_strategy"] = row["ingest_strategy"] + if row.get("platform"): + edit_extra["platform"] = 
row["platform"] + if row.get("platform_id"): + edit_extra["platform_id"] = row["platform_id"] entity_urls = self.parse_fileset_urls(row) if not entity_urls: - self.counts['skip-no-access-url'] += 1 + self.counts["skip-no-access-url"] += 1 return None - assert row['file_count'] == len(row['manifest']) - if row['file_count'] > self.max_file_count: - self.counts['skip-too-many-manifest-files'] += 1 + assert row["file_count"] == len(row["manifest"]) + if row["file_count"] > self.max_file_count: + self.counts["skip-too-many-manifest-files"] += 1 return None manifest = [] - for ingest_file in row['manifest']: + for ingest_file in row["manifest"]: fsf = fatcat_openapi_client.FilesetFile( - path=ingest_file['path'], - size=ingest_file['size'], - md5=ingest_file['md5'], - sha1=ingest_file['sha1'], - sha256=ingest_file.get('sha256'), + path=ingest_file["path"], + size=ingest_file["size"], + md5=ingest_file["md5"], + sha1=ingest_file["sha1"], + sha256=ingest_file.get("sha256"), extra=dict( - mimetype=ingest_file['mimetype'], + mimetype=ingest_file["mimetype"], ), ) if not (fsf.md5 and fsf.sha1 and fsf.path and fsf.size): - self.counts['skip-partial-file-info'] += 1 + self.counts["skip-partial-file-info"] += 1 return None - if ingest_file.get('platform_url'): + if ingest_file.get("platform_url"): # XXX: should we include this? - fsf.extra['original_url'] = ingest_file['platform_url'] - if ingest_file.get('terminal_url') and ingest_file.get('terminal_dt'): - fsf.extra['wayback_url'] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}" + fsf.extra["original_url"] = ingest_file["platform_url"] + if ingest_file.get("terminal_url") and ingest_file.get("terminal_dt"): + fsf.extra[ + "wayback_url" + ] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}" manifest.append(fsf) fe = fatcat_openapi_client.FilesetEntity( @@ -745,11 +795,11 @@ class IngestFilesetResultImporter(IngestFileResultImporter): for other in self._entity_queue: # XXX: how to duplicate check? 
if other.original_url == wc.original_url: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False # lookup sha1, or create new entity (TODO: API doesn't support this yet) - #existing = None + # existing = None # NOTE: in lieu of existing checks (by lookup), only allow one fileset per release release = self.api.get_release(wc.release_ids[0], expand="filesets") @@ -759,27 +809,32 @@ class IngestFilesetResultImporter(IngestFileResultImporter): for other in release.filesets: if wc.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False - self.counts['skip-release-has-fileset'] += 1 + self.counts["skip-release-has-fileset"] += 1 return False return True def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_fileset(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_fileset_auto_batch( + fatcat_openapi_client.FilesetAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) class SavePaperNowFilesetImporter(IngestFilesetResultImporter): @@ -789,28 +844,28 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Fileset crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFilesetImporter') - kwargs['submit_mode'] = submit_mode - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Fileset crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFilesetImporter") + kwargs["submit_mode"] = submit_mode + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False if not self.want_fileset(row): diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 0a983c5e..8e3af416 100644 --- a/python/fatcat_tools/importers/jalc.py 
+++ b/python/fatcat_tools/importers/jalc.py @@ -1,4 +1,3 @@ - import datetime import sqlite3 import sys @@ -33,26 +32,24 @@ def parse_jalc_persons(raw_persons): # first parse out into language-agnostic dics for raw in raw_persons: - name = raw.find('name') or None + name = raw.find("name") or None if name: - name = clean(name.get_text().replace('\n', ' ')) - surname = raw.find('familyName') or None + name = clean(name.get_text().replace("\n", " ")) + surname = raw.find("familyName") or None if surname: - surname = clean(surname.get_text().replace('\n', ' ')) - given_name = raw.find('givenName') or None + surname = clean(surname.get_text().replace("\n", " ")) + given_name = raw.find("givenName") or None if given_name: - given_name = clean(given_name.get_text().replace('\n', ' ')) - lang = 'en' + given_name = clean(given_name.get_text().replace("\n", " ")) + lang = "en" if is_cjk(name): - lang = 'ja' - if lang == 'en' and surname and given_name: + lang = "ja" + if lang == "en" and surname and given_name: # english names order is flipped name = "{} {}".format(given_name, surname) rc = fatcat_openapi_client.ReleaseContrib( - raw_name=name, - surname=surname, - given_name=given_name, - role="author") + raw_name=name, surname=surname, given_name=given_name, role="author" + ) # add an extra hint field; won't end up in serialized object rc._lang = lang persons.append(rc) @@ -60,12 +57,12 @@ def parse_jalc_persons(raw_persons): if not persons: return [] - if all([p._lang == 'en' for p in persons]) or all([p._lang == 'ja' for p in persons]): + if all([p._lang == "en" for p in persons]) or all([p._lang == "ja" for p in persons]): # all english names, or all japanese names return persons # for debugging - #if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']): + # if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']): # print("INTERESTING: {}".format(persons[0])) start_lang = persons[0]._lang @@ -74,10 +71,10 @@ def parse_jalc_persons(raw_persons): if p._lang == start_lang: contribs.append(p) else: - if p._lang == 'en' and contribs[-1]._lang == 'ja': + if p._lang == "en" and contribs[-1]._lang == "ja": eng = p jpn = contribs[-1] - elif p._lang == 'ja' and contribs[-1]._lang == 'en': + elif p._lang == "ja" and contribs[-1]._lang == "en": eng = contribs[-1] jpn = p else: @@ -85,11 +82,11 @@ def parse_jalc_persons(raw_persons): contribs.append(p) continue eng.extra = { - 'original_name': { - 'lang': jpn._lang, - 'raw_name': jpn.raw_name, - 'given_name': jpn.given_name, - 'surname': jpn.surname, + "original_name": { + "lang": jpn._lang, + "raw_name": jpn.raw_name, + "given_name": jpn.given_name, + "surname": jpn.surname, }, } contribs[-1] = eng @@ -105,18 +102,19 @@ class JalcImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of JALC DOI metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JalcImporter') - super().__init__(api, + eg_desc = kwargs.get("editgroup_description", "Automated import of JALC DOI metadata") + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JalcImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers = kwargs.get('create_containers', True) - extid_map_file = 
kwargs.get('extid_map_file') + self.create_containers = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -129,12 +127,27 @@ class JalcImporter(EntityImporter): def lookup_ext_ids(self, doi): if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -163,27 +176,27 @@ class JalcImporter(EntityImporter): titles = record.find_all("title") if not titles: return None - title = titles[0].get_text().replace('\n', ' ').strip() + title = titles[0].get_text().replace("\n", " ").strip() original_title = None - if title.endswith('.'): + if title.endswith("."): title = title[:-1] if len(titles) > 1: - original_title = titles[1].get_text().replace('\n', ' ').strip() - if original_title.endswith('.'): + original_title = titles[1].get_text().replace("\n", " ").strip() + if original_title.endswith("."): original_title = original_title[:-1] doi = None if record.doi: doi = clean_doi(record.doi.string.strip().lower()) - if doi.startswith('http://dx.doi.org/'): - doi = doi.replace('http://dx.doi.org/', '') - elif doi.startswith('https://dx.doi.org/'): - doi = doi.replace('https://dx.doi.org/', '') - elif doi.startswith('http://doi.org/'): - doi = doi.replace('http://doi.org/', '') - elif doi.startswith('https://doi.org/'): - doi = doi.replace('https://doi.org/', '') - if not (doi.startswith('10.') and '/' in doi): + if doi.startswith("http://dx.doi.org/"): + doi = doi.replace("http://dx.doi.org/", "") + elif doi.startswith("https://dx.doi.org/"): + doi = doi.replace("https://dx.doi.org/", "") + elif doi.startswith("http://doi.org/"): + doi = doi.replace("http://doi.org/", "") + elif doi.startswith("https://doi.org/"): + doi = doi.replace("https://doi.org/", "") + if not (doi.startswith("10.") and "/" in doi): sys.stderr.write("bogus JALC DOI: {}\n".format(doi)) doi = None if not doi: @@ -202,7 +215,9 @@ class JalcImporter(EntityImporter): if date: date = date.string if len(date) == 10: - release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date() + release_date = datetime.datetime.strptime( + date["completed-date"], DATE_FMT + ).date() release_year = release_date.year release_date = release_date.isoformat() elif len(date) == 4 and date.isdigit(): @@ -214,7 +229,7 @@ class JalcImporter(EntityImporter): if record.endingPage and record.endingPage.string.strip(): pages = "{}-{}".format(pages, record.endingPage.string.strip()) # double check to prevent "-" as pages - if pages and pages.strip() == '-': + if pages and pages.strip() == "-": pages = None volume = None @@ -242,9 +257,13 @@ class JalcImporter(EntityImporter): container_extra = dict() if 
record.publicationName: - pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()] + pubs = [ + p.get_text().replace("\n", " ").strip() + for p in record.find_all("publicationName") + if p.get_text() + ] pubs = [clean(p) for p in pubs if p] - assert(pubs) + assert pubs if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] if len(pubs) > 1 and is_cjk(pubs[0]): @@ -252,10 +271,14 @@ class JalcImporter(EntityImporter): pubs = [pubs[1], pubs[0]] container_name = clean(pubs[0]) if len(pubs) > 1: - container_extra['original_name'] = clean(pubs[1]) + container_extra["original_name"] = clean(pubs[1]) if record.publisher: - pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()] + pubs = [ + p.get_text().replace("\n", " ").strip() + for p in record.find_all("publisher") + if p.get_text() + ] pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] @@ -265,20 +288,25 @@ class JalcImporter(EntityImporter): if pubs: publisher = clean(pubs[0]) if len(pubs) > 1: - container_extra['publisher_aliases'] = pubs[1:] - - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + container_extra["publisher_aliases"] = pubs[1:] + + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): # name, type, publisher, issnl # extra: issnp, issne, original_name, languages, country - container_extra['country'] = 'jp' - container_extra['languages'] = ['ja'] + container_extra["country"] = "jp" + container_extra["languages"] = ["ja"] ce = fatcat_openapi_client.ContainerEntity( name=container_name, - container_type='journal', + container_type="journal", publisher=publisher, issnl=issnl, - extra=(container_extra or None)) + extra=(container_extra or None), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident # short-cut future imports in same batch @@ -301,7 +329,7 @@ class JalcImporter(EntityImporter): # group-title # always put at least an empty dict here to indicate the DOI registrar # (informally) - extra['jalc'] = extra_jalc + extra["jalc"] = extra_jalc title = clean(title) if not title: @@ -312,24 +340,24 @@ class JalcImporter(EntityImporter): title=title, original_title=clean(original_title), release_type=release_type, - release_stage='published', + release_stage="published", release_date=release_date, release_year=release_year, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), volume=volume, issue=issue, pages=pages, publisher=publisher, language=lang, - #license_slug + # license_slug container_id=container_id, contribs=contribs, extra=extra, @@ -351,17 +379,20 @@ class JalcImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + 
self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): """ @@ -374,11 +405,11 @@ class JalcImporter(EntityImporter): # 2. iterate over articles, call parse_article on each for record in soup.find_all("Description"): resp = self.parse_record(record) - #print(json.dumps(resp)) + # print(json.dumps(resp)) print(resp) - #sys.exit(-1) + # sys.exit(-1) -if __name__=='__main__': +if __name__ == "__main__": parser = JalcImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 25d7b3b5..6d1fefa3 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from .common import EntityImporter, clean @@ -11,18 +10,20 @@ def or_none(s): return None return s + def truthy(s): if s is None: return None s = s.lower() - if s in ('true', 't', 'yes', 'y', '1'): + if s in ("true", "t", "yes", "y", "1"): return True - elif s in ('false', 'f', 'no', 'n', '0'): + elif s in ("false", "f", "no", "n", "0"): return False else: return None + class JournalMetadataImporter(EntityImporter): """ Imports journal metadata ("containers") by ISSN, currently from a custom @@ -33,17 +34,16 @@ class JournalMetadataImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of container-level metadata, by ISSN. 
Metadata from Internet Archive munging.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JournalMetadataImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): - if raw_record.get('issnl') and raw_record.get('name'): + if raw_record.get("issnl") and raw_record.get("name"): return True return False @@ -54,52 +54,68 @@ class JournalMetadataImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - if not row.get('name'): + if not row.get("name"): # Name is required (by schema) return None extra = dict() - for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev', - 'coden', 'aliases', 'original_name', 'first_year', 'last_year', - 'platform', 'default_license', 'road', 'mimetypes', - 'sherpa_romeo', 'kbart'): + for key in ( + "issne", + "issnp", + "languages", + "country", + "urls", + "abbrev", + "coden", + "aliases", + "original_name", + "first_year", + "last_year", + "platform", + "default_license", + "road", + "mimetypes", + "sherpa_romeo", + "kbart", + ): if row.get(key): extra[key] = row[key] # TODO: not including for now: norwegian, dois/crossref, ia extra_doaj = dict() - if row.get('doaj'): - if row['doaj'].get('as_of'): - extra_doaj['as_of'] = row['doaj']['as_of'] - if row['doaj'].get('works'): - extra_doaj['works'] = row['doaj']['works'] + if row.get("doaj"): + if row["doaj"].get("as_of"): + extra_doaj["as_of"] = row["doaj"]["as_of"] + if row["doaj"].get("works"): + extra_doaj["works"] = row["doaj"]["works"] if extra_doaj: - extra['doaj'] = extra_doaj + extra["doaj"] = extra_doaj extra_ia = dict() # TODO: would like an ia.longtail_ia flag - if row.get('sim'): + if row.get("sim"): # NB: None case of the .get() here is blech, but othrwise # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on - extra_ia['sim'] = { - 'year_spans': row['sim'].get('year_spans'), + extra_ia["sim"] = { + "year_spans": row["sim"].get("year_spans"), } if extra_ia: - extra['ia'] = extra_ia + extra["ia"] = extra_ia - name = clean(row.get('name')) + name = clean(row.get("name")) if not name: return None ce = fatcat_openapi_client.ContainerEntity( - issnl=row['issnl'], - issne=row.get('issne'), - issnp=row.get('issnp'), - container_type=None, # TODO + issnl=row["issnl"], + issne=row.get("issne"), + issnp=row.get("issnp"), + container_type=None, # TODO name=name, - publisher=clean(row.get('publisher')), - wikidata_qid=None, # TODO - extra=extra) + publisher=clean(row.get("publisher")), + wikidata_qid=None, # TODO + extra=extra, + ) return ce def try_update(self, ce): @@ -118,23 +134,26 @@ class JournalMetadataImporter(EntityImporter): # for now, only update KBART, and only if there is new content if not existing.extra: existing.extra = dict() - if ce.extra.get('kbart') and (existing.extra.get('kbart') != ce.extra['kbart']): - if not existing.extra.get('kbart'): - existing.extra['kbart'] = {} - existing.extra['kbart'].update(ce.extra['kbart']) + if ce.extra.get("kbart") and (existing.extra.get("kbart") != ce.extra["kbart"]): + if not existing.extra.get("kbart"): + existing.extra["kbart"] = {} + existing.extra["kbart"].update(ce.extra["kbart"]) self.api.update_container(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False else: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # if we got this far, it's a bug raise 
NotImplementedError def insert_batch(self, batch): - self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_container_auto_batch( + fatcat_openapi_client.ContainerAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index d37424d6..8c7bfad4 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,4 +1,3 @@ - import datetime import json import sys @@ -12,10 +11,10 @@ from .crossref import CONTAINER_TYPE_MAP # TODO: more entries? JSTOR_CONTRIB_MAP = { - 'author': 'author', - 'editor': 'editor', - 'translator': 'translator', - 'illustrator': 'illustrator', + "author": "author", + "editor": "editor", + "translator": "translator", + "illustrator": "illustrator", } JSTOR_TYPE_MAP = { @@ -26,6 +25,7 @@ JSTOR_TYPE_MAP = { "research-article": "article-journal", } + class JstorImporter(EntityImporter): """ Importer for JSTOR bulk XML metadata (eg, from their Early Journals @@ -34,17 +34,18 @@ class JstorImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of JSTOR XML metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter') - super().__init__(api, + eg_desc = kwargs.get("editgroup_description", "Automated import of JSTOR XML metadata") + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JstorImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers = kwargs.get('create_containers', True) + self.create_containers = kwargs.get("create_containers", True) self.read_issn_map_file(issn_map_file) @@ -62,20 +63,22 @@ class JstorImporter(EntityImporter): extra = dict() extra_jstor = dict() - release_type = JSTOR_TYPE_MAP.get(article['article-type']) + release_type = JSTOR_TYPE_MAP.get(article["article-type"]) title = article_meta.find("article-title") if title and title.get_text(): - title = title.get_text().replace('\n', ' ').strip() + title = title.get_text().replace("\n", " ").strip() elif title and not title.get_text(): title = None - if not title and release_type.startswith('review') and article_meta.product.source: - title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text()) + if not title and release_type.startswith("review") and article_meta.product.source: + title = "Review: {}".format( + article_meta.product.source.replace("\n", " ").get_text() + ) if not title: return None - if title.endswith('.'): + if title.endswith("."): title = title[:-1] if "[Abstract]" in title: @@ -93,12 +96,12 @@ class JstorImporter(EntityImporter): title = title[1:-1] # JSTOR journal-id - journal_ids = [j.string for j in journal_meta.find_all('journal-id')] + journal_ids = [j.string for j in journal_meta.find_all("journal-id")] if journal_ids: - extra_jstor['journal_ids'] = journal_ids + extra_jstor["journal_ids"] = journal_ids - journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ') - publisher = 
journal_meta.find("publisher-name").get_text().replace('\n', ' ') + journal_title = journal_meta.find("journal-title").get_text().replace("\n", " ") + publisher = journal_meta.find("publisher-name").get_text().replace("\n", " ") issn = journal_meta.find("issn") if issn: issn = issn.string @@ -113,13 +116,18 @@ class JstorImporter(EntityImporter): container_id = self.lookup_issnl(issnl) # create container if it doesn't exist - if (container_id is None and self.create_containers and (issnl is not None) - and journal_title): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and journal_title + ): ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=clean(journal_title, force_xml=True)) + name=clean(journal_title, force_xml=True), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -132,8 +140,8 @@ class JstorImporter(EntityImporter): if jstor_id: jstor_id = jstor_id.string.strip() if not jstor_id and doi: - assert doi.startswith('10.2307/') - jstor_id = doi.replace('10.2307/', '') + assert doi.startswith("10.2307/") + jstor_id = doi.replace("10.2307/", "") assert jstor_id and int(jstor_id) contribs = [] @@ -142,13 +150,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.get_text().replace('\n', ' ')) + given = clean(given.get_text().replace("\n", " ")) surname = c.find("surname") if surname: - surname = clean(surname.get_text().replace('\n', ' ')) + surname = clean(surname.get_text().replace("\n", " ")) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.get_text().replace('\n', ' ')) + raw_name = clean(raw_name.get_text().replace("\n", " ")) if not raw_name: if given and surname: @@ -156,15 +164,17 @@ class JstorImporter(EntityImporter): elif surname: raw_name = surname - role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author')) - if not role and c.get('contrib-type'): - sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c['contrib-type'])) - contribs.append(fatcat_openapi_client.ReleaseContrib( - role=role, - raw_name=raw_name, - given_name=given, - surname=surname, - )) + role = JSTOR_CONTRIB_MAP.get(c.get("contrib-type", "author")) + if not role and c.get("contrib-type"): + sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c["contrib-type"])) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + role=role, + raw_name=raw_name, + given_name=given, + surname=surname, + ) + ) for i, contrib in enumerate(contribs): if contrib.raw_name != "et al.": @@ -172,14 +182,13 @@ class JstorImporter(EntityImporter): release_year = None release_date = None - pub_date = article_meta.find('pub-date') + pub_date = article_meta.find("pub-date") if pub_date and pub_date.year: release_year = int(pub_date.year.string) if pub_date.month and pub_date.day: release_date = datetime.date( - release_year, - int(pub_date.month.string), - int(pub_date.day.string)) + release_year, int(pub_date.month.string), int(pub_date.day.string) + ) if release_date.day == 1 and release_date.month == 1: # suspect jan 1st dates get set by JSTOR when actual # date not known (citation needed), so drop them @@ -208,10 +217,10 @@ class JstorImporter(EntityImporter): warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) # JSTOR issue-id - if article_meta.find('issue-id'): - issue_id = 
clean(article_meta.find('issue-id').string) + if article_meta.find("issue-id"): + issue_id = clean(article_meta.find("issue-id").string) if issue_id: - extra_jstor['issue_id'] = issue_id + extra_jstor["issue_id"] = issue_id # everything in JSTOR is published release_stage = "published" @@ -225,14 +234,14 @@ class JstorImporter(EntityImporter): # group-title # pubmed: retraction refs if extra_jstor: - extra['jstor'] = extra_jstor + extra["jstor"] = extra_jstor if not extra: extra = None re = fatcat_openapi_client.ReleaseEntity( - #work_id + # work_id title=title, - #original_title + # original_title release_type=release_type, release_stage=release_stage, release_date=release_date, @@ -246,21 +255,16 @@ class JstorImporter(EntityImporter): pages=pages, publisher=publisher, language=language, - #license_slug - + # license_slug # content, mimetype, lang - #abstracts=abstracts, - + # abstracts=abstracts, contribs=contribs, - # key, year, container_name, title, locator # extra: volume, authors, issue, publisher, identifiers - #refs=refs, - + # refs=refs, # name, type, publisher, issnl # extra: issnp, issne, original_name, languages, country container_id=container_id, - extra=extra, ) return re @@ -289,12 +293,12 @@ class JstorImporter(EntityImporter): if existing and existing.ext_ids.jstor: # don't update if it already has JSTOR ID - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False elif existing: # but do update if only DOI was set existing.ext_ids.jstor = re.ext_ids.jstor - existing.extra['jstor'] = re.extra['jstor'] + existing.extra["jstor"] = re.extra["jstor"] # better release_type detection, and some other fields # TODO: don't do this over-writing in the future? assuming here # this is a one-time batch import over/extending bootstrap crossref @@ -304,17 +308,20 @@ class JstorImporter(EntityImporter): existing.contribs = re.contribs existing.language = re.language self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): @@ -325,8 +332,9 @@ class JstorImporter(EntityImporter): for article in soup.find_all("article"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) + -if __name__=='__main__': +if __name__ == "__main__": parser = JstorImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 09807276..7c2a6a87 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from fatcat_tools.normal import clean_doi @@ -32,13 +31,13 @@ class MatchedImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Import of large-scale file-to-release match results. Source of metadata varies." 
- eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Import of large-scale file-to-release match results. Source of metadata varies." + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.MatchedImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mimetype = kwargs.get("default_mimetype", None) @@ -46,14 +45,14 @@ class MatchedImporter(EntityImporter): return True def parse_record(self, obj): - dois = [d.lower() for d in obj.get('dois', [])] + dois = [d.lower() for d in obj.get("dois", [])] # lookup dois re_list = set() for doi in dois: doi = clean_doi(doi) if not doi: - self.counts['skip-bad-doi'] += 1 + self.counts["skip-bad-doi"] += 1 return None try: re = self.api.lookup_release(doi=doi) @@ -62,13 +61,22 @@ class MatchedImporter(EntityImporter): raise err re = None if re is None: - #print("DOI not found: {}".format(doi)) + # print("DOI not found: {}".format(doi)) pass else: re_list.add(re.ident) # look up other external ids - for extid_type in ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid', 'core', 'isbn13', 'ark'): + for extid_type in ( + "arxiv", + "pmid", + "pmcid", + "jstor", + "wikidata_qid", + "core", + "isbn13", + "ark", + ): extid = obj.get(extid_type) if extid: try: @@ -84,49 +92,47 @@ class MatchedImporter(EntityImporter): release_ids = list(re_list) if len(release_ids) == 0: - self.counts['skip-no-releases'] += 1 + self.counts["skip-no-releases"] += 1 return None if len(release_ids) > SANE_MAX_RELEASES: - self.counts['skip-too-many-releases'] += 1 + self.counts["skip-too-many-releases"] += 1 return None # parse URLs and CDX urls = set() - for url in obj.get('urls', []): + for url in obj.get("urls", []): url = make_rel_url(url, default_link_rel=self.default_link_rel) if url is not None: urls.add(url) - for cdx in obj.get('cdx', []): - original = cdx['url'] - if cdx.get('dt'): - wayback = "https://web.archive.org/web/{}/{}".format( - cdx['dt'], - original) + for cdx in obj.get("cdx", []): + original = cdx["url"] + if cdx.get("dt"): + wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original) urls.add(("webarchive", wayback)) url = make_rel_url(original, default_link_rel=self.default_link_rel) if url is not None: urls.add(url) urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] if len(urls) == 0: - self.counts['skip-no-urls'] += 1 + self.counts["skip-no-urls"] += 1 return None if len(urls) > SANE_MAX_URLS: - self.counts['skip-too-many-urls'] += 1 + self.counts["skip-too-many-urls"] += 1 return None - size = obj.get('size') + size = obj.get("size") if size: size = int(size) - mimetype = obj.get('mimetype', self.default_mimetype) + mimetype = obj.get("mimetype", self.default_mimetype) if not mimetype and urls: - if urls[0].url.endswith('.pdf'): - mimetype = 'application/pdf' + if urls[0].url.endswith(".pdf"): + mimetype = "application/pdf" fe = fatcat_openapi_client.FileEntity( - md5=obj.get('md5'), - sha1=obj['sha1'], - sha256=obj.get('sha256'), + md5=obj.get("md5"), + sha1=obj["sha1"], + sha256=obj.get("sha256"), size=size, mimetype=mimetype, release_ids=release_ids, @@ -149,28 +155,30 @@ class 
MatchedImporter(EntityImporter): combined_release_ids = list(set(fe.release_ids + existing.release_ids)) if set(combined_release_ids) == set(existing.release_ids) and len(existing.urls) > 0: # no new release matches *and* there are already existing URLs - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # check for edit conflicts if existing.ident in [e.ident for e in self._edits_inflight]: - self.counts['skip-update-inflight'] += 1 + self.counts["skip-update-inflight"] += 1 return False # minimum viable "existing" URL cleanup to fix dupes and broken links: # remove 'None' wayback URLs, and set archive.org rel 'archive' - existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + existing.urls = [ + u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url) + ] for i in range(len(existing.urls)): u = existing.urls[i] - if u.rel == 'repository' and '://archive.org/download/' in u.url: - existing.urls[i].rel = 'archive' + if u.rel == "repository" and "://archive.org/download/" in u.url: + existing.urls[i].rel = "archive" # special case: if importing *new* from archive.org arxiv collections, # blow away any existing release_id mappings; this is a direct arxiv_id # map. This *should* be safe to run in all matched imports. is_arxiv = False for u in fe.urls: - if 'archive.org/download/arxiv' in u.url.lower(): + if "archive.org/download/arxiv" in u.url.lower(): is_arxiv = True break if is_arxiv and fe.release_ids: @@ -178,14 +186,16 @@ class MatchedImporter(EntityImporter): # merge the existing into this one and update existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) - existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + existing.urls = [ + fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls + ] if len(existing.urls) > SANE_MAX_URLS: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) if len(existing.release_ids) > SANE_MAX_RELEASES: - self.counts['skip-update-too-many-releases'] += 1 + self.counts["skip-update-too-many-releases"] += 1 return None existing.mimetype = existing.mimetype or fe.mimetype existing.size = existing.size or fe.size @@ -194,12 +204,15 @@ class MatchedImporter(EntityImporter): existing.sha256 = existing.sha256 or fe.sha256 edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) self._edits_inflight.append(edit) - self.counts['update'] += 1 + self.counts["update"] += 1 return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 3bdd23a1..b514e6e5 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,4 +1,3 @@ - import sys import fatcat_openapi_client @@ -8,7 +7,7 @@ from .common import EntityImporter, clean def value_or_none(e): if type(e) == dict: - e = e.get('value') + e = e.get("value") if type(e) == 
str and len(e) == 0: e = None # TODO: this is probably bogus; patched in desperation; remove? @@ -21,18 +20,17 @@ def value_or_none(e): return None return e -class OrcidImporter(EntityImporter): +class OrcidImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of ORCID metadata, from official bulk releases.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of ORCID metadata, from official bulk releases.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.OrcidImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): return True @@ -43,16 +41,16 @@ class OrcidImporter(EntityImporter): returns a CreatorEntity """ - if 'person' not in obj: + if "person" not in obj: return False - name = obj['person']['name'] + name = obj["person"]["name"] if not name: return None extra = None - given = value_or_none(name.get('given-names')) - sur = value_or_none(name.get('family-name')) - display = value_or_none(name.get('credit-name')) + given = value_or_none(name.get("given-names")) + sur = value_or_none(name.get("family-name")) + display = value_or_none(name.get("credit-name")) if display is None: # TODO: sorry human beings if given and sur: @@ -61,7 +59,7 @@ class OrcidImporter(EntityImporter): display = sur elif given: display = given - orcid = obj['orcid-identifier']['path'] + orcid = obj["orcid-identifier"]["path"] if not self.is_orcid(orcid): sys.stderr.write("Bad ORCID: {}\n".format(orcid)) return None @@ -74,7 +72,8 @@ class OrcidImporter(EntityImporter): given_name=clean(given), surname=clean(sur), display_name=display, - extra=extra) + extra=extra, + ) return ce def try_update(self, raw_record): @@ -88,14 +87,17 @@ class OrcidImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_creator_auto_batch(fatcat_openapi_client.CreatorAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_creator_auto_batch( + fatcat_openapi_client.CreatorAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 00ad54d0..cfdafcf7 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -1,4 +1,3 @@ - import datetime import json import sys @@ -13,42 +12,42 @@ from .common import LANG_MAP_MARC, EntityImporter, clean # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly PUBMED_RELEASE_TYPE_MAP = { - #Adaptive Clinical Trial + # Adaptive Clinical Trial "Address": "speech", "Autobiography": "book", - #Bibliography + # Bibliography "Biography": "book", - #Case Reports + # Case Reports "Classical Article": "article-journal", - #Clinical Conference - #Clinical Study - 
#Clinical Trial - #Clinical Trial, Phase I - #Clinical Trial, Phase II - #Clinical Trial, Phase III - #Clinical Trial, Phase IV - #Clinical Trial Protocol - #Clinical Trial, Veterinary - #Collected Works - #Comparative Study - #Congress - #Consensus Development Conference - #Consensus Development Conference, NIH - #Controlled Clinical Trial + # Clinical Conference + # Clinical Study + # Clinical Trial + # Clinical Trial, Phase I + # Clinical Trial, Phase II + # Clinical Trial, Phase III + # Clinical Trial, Phase IV + # Clinical Trial Protocol + # Clinical Trial, Veterinary + # Collected Works + # Comparative Study + # Congress + # Consensus Development Conference + # Consensus Development Conference, NIH + # Controlled Clinical Trial "Dataset": "dataset", - #Dictionary - #Directory - #Duplicate Publication + # Dictionary + # Directory + # Duplicate Publication "Editorial": "editorial", - #English Abstract # doesn't indicate that this is abstract-only - #Equivalence Trial - #Evaluation Studies - #Expression of Concern - #Festschrift - #Government Document - #Guideline + # English Abstract # doesn't indicate that this is abstract-only + # Equivalence Trial + # Evaluation Studies + # Expression of Concern + # Festschrift + # Government Document + # Guideline "Historical Article": "article-journal", - #Interactive Tutorial + # Interactive Tutorial "Interview": "interview", "Introductory Journal Article": "article-journal", "Journal Article": "article-journal", @@ -56,53 +55,65 @@ PUBMED_RELEASE_TYPE_MAP = { "Legal Case": "legal_case", "Legislation": "legislation", "Letter": "letter", - #Meta-Analysis - #Multicenter Study - #News + # Meta-Analysis + # Multicenter Study + # News "Newspaper Article": "article-newspaper", - #Observational Study - #Observational Study, Veterinary - #Overall - #Patient Education Handout - #Periodical Index - #Personal Narrative - #Portrait - #Practice Guideline - #Pragmatic Clinical Trial - #Publication Components - #Publication Formats - #Publication Type Category - #Randomized Controlled Trial - #Research Support, American Recovery and Reinvestment Act - #Research Support, N.I.H., Extramural - #Research Support, N.I.H., Intramural - #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. - #Research Support, U.S. Gov't, P.H.S. - #Review # in the "literature review" sense, not "product review" - #Scientific Integrity Review - #Study Characteristics - #Support of Research - #Systematic Review + # Observational Study + # Observational Study, Veterinary + # Overall + # Patient Education Handout + # Periodical Index + # Personal Narrative + # Portrait + # Practice Guideline + # Pragmatic Clinical Trial + # Publication Components + # Publication Formats + # Publication Type Category + # Randomized Controlled Trial + # Research Support, American Recovery and Reinvestment Act + # Research Support, N.I.H., Extramural + # Research Support, N.I.H., Intramural + # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. + # Research Support, U.S. Gov't, P.H.S. 
+ # Review # in the "literature review" sense, not "product review" + # Scientific Integrity Review + # Study Characteristics + # Support of Research + # Systematic Review "Technical Report": "report", - #Twin Study - #Validation Studies - #Video-Audio Media - #Webcasts + # Twin Study + # Validation Studies + # Video-Audio Media + # Webcasts } MONTH_ABBR_MAP = { - "Jan": 1, "01": 1, - "Feb": 2, "02": 2, - "Mar": 3, "03": 3, - "Apr": 4, "04": 4, - "May": 5, "05": 5, - "Jun": 6, "06": 6, - "Jul": 7, "07": 7, - "Aug": 8, "08": 8, - "Sep": 9, "09": 9, - "Oct": 10, "10": 10, - "Nov": 11, "11": 11, - "Dec": 12, "12": 12, + "Jan": 1, + "01": 1, + "Feb": 2, + "02": 2, + "Mar": 3, + "03": 3, + "Apr": 4, + "04": 4, + "May": 5, + "05": 5, + "Jun": 6, + "06": 6, + "Jul": 7, + "07": 7, + "Aug": 8, + "08": 8, + "Sep": 9, + "09": 9, + "Oct": 10, + "10": 10, + "Nov": 11, + "11": 11, + "Dec": 12, + "12": 12, } # From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ @@ -295,11 +306,10 @@ COUNTRY_NAME_MAP = { "United Kingdom": "gb", "United States": "us", "Uruguay": "uy", - # Additions from running over large files "Bosnia and Herzegovina": "ba", - #"International" - "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn + # "International" + "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn "Russia (Federation)": "ru", "Scotland": "gb", "England": "gb", @@ -320,18 +330,21 @@ class PubmedImporter(EntityImporter): def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of PubMed/MEDLINE XML metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter') - super().__init__(api, + eg_desc = kwargs.get( + "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata" + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.PubmedImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) self.lookup_refs = lookup_refs - self.create_containers = kwargs.get('create_containers', True) + self.create_containers = kwargs.get("create_containers", True) self.read_issn_map_file(issn_map_file) def want(self, obj): @@ -365,15 +378,15 @@ class PubmedImporter(EntityImporter): release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string] break if pub_types: - extra_pubmed['pub_types'] = pub_types + extra_pubmed["pub_types"] = pub_types if medline.Article.PublicationTypeList.find(string="Retraction of Publication"): release_type = "retraction" retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf") if retraction_of: if retraction_of.RefSource: - extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string + extra_pubmed["retraction_of_raw"] = retraction_of.RefSource.string if retraction_of.PMID: - extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string + extra_pubmed["retraction_of_pmid"] = retraction_of.PMID.string # everything in medline is published release_stage = "published" @@ -388,18 +401,18 @@ class PubmedImporter(EntityImporter): elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"): withdrawn_status = "concern" - pages = medline.find('MedlinePgn') + pages = medline.find("MedlinePgn") if pages: pages = pages.string - title = medline.Article.ArticleTitle.get_text() # always present + title = medline.Article.ArticleTitle.get_text() # always 
present if title: - title = title.replace('\n', ' ') - if title.endswith('.'): + title = title.replace("\n", " ") + if title.endswith("."): title = title[:-1] # this hides some "special" titles, but the vast majority are # translations; translations don't always include the original_title - if title.startswith('[') and title.endswith(']'): + if title.startswith("[") and title.endswith("]"): title = title[1:-1] else: # will filter out later @@ -408,8 +421,8 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: original_title = original_title.get_text() or None - original_title = original_title.replace('\n', ' ') - if original_title and original_title.endswith('.'): + original_title = original_title.replace("\n", " ") + if original_title and original_title.endswith("."): original_title = original_title[:-1] if original_title and not title: @@ -428,7 +441,9 @@ class PubmedImporter(EntityImporter): else: language = LANG_MAP_MARC.get(language) if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC): - warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) + warnings.warn( + "MISSING MARC LANG: {}".format(medline.Article.Language.string) + ) ### Journal/Issue Metadata # MedlineJournalInfo is always present @@ -441,9 +456,9 @@ class PubmedImporter(EntityImporter): country_name = mji.Country.string.strip() country_code = COUNTRY_NAME_MAP.get(country_name) if country_code: - container_extra['country'] = country_code + container_extra["country"] = country_code elif country_name: - container_extra['country_name'] = country_name + container_extra["country_name"] = country_name if mji.find("ISSNLinking"): issnl = mji.ISSNLinking.string @@ -462,7 +477,7 @@ class PubmedImporter(EntityImporter): if issnl: container_id = self.lookup_issnl(issnl) - pub_date = medline.Article.find('ArticleDate') + pub_date = medline.Article.find("ArticleDate") if not pub_date: pub_date = journal.PubDate if not pub_date: @@ -476,7 +491,8 @@ class PubmedImporter(EntityImporter): release_date = datetime.date( release_year, MONTH_ABBR_MAP[pub_date.Month.string], - int(pub_date.Day.string)) + int(pub_date.Day.string), + ) release_date = release_date.isoformat() except ValueError as ve: print("bad date, skipping: {}".format(ve), file=sys.stderr) @@ -486,25 +502,35 @@ class PubmedImporter(EntityImporter): if len(medline_date) >= 4 and medline_date[:4].isdigit(): release_year = int(medline_date[:4]) if release_year < 1300 or release_year > 2040: - print("bad medline year, skipping: {}".format(release_year), file=sys.stderr) + print( + "bad medline year, skipping: {}".format(release_year), file=sys.stderr + ) release_year = None else: - print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) + print( + "unparsable medline date, skipping: {}".format(medline_date), + file=sys.stderr, + ) if journal.find("Title"): container_name = journal.Title.get_text() - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): # name, type, publisher, issnl # extra: original_name, languages, country ce = fatcat_openapi_client.ContainerEntity( name=container_name, - container_type='journal', - #NOTE: publisher not included + container_type="journal", + # NOTE: publisher not included issnl=issnl, issnp=issnp, - extra=(container_extra or None)) + 
extra=(container_extra or None), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -521,8 +547,10 @@ class PubmedImporter(EntityImporter): # "All abstracts are in English" abstracts = [] primary_abstract = medline.find("Abstract") - if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'): - joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")]) + if primary_abstract and primary_abstract.AbstractText.get("NlmCategory"): + joined = "\n".join( + [m.get_text() for m in primary_abstract.find_all("AbstractText")] + ) abst = fatcat_openapi_client.ReleaseAbstract( content=joined, mimetype="text/plain", @@ -539,7 +567,7 @@ class PubmedImporter(EntityImporter): ) if abst.content: abstracts.append(abst) - if abstract.find('math'): + if abstract.find("math"): abst = fatcat_openapi_client.ReleaseAbstract( # strip the <AbstractText> tags content=str(abstract)[14:-15], @@ -551,8 +579,8 @@ class PubmedImporter(EntityImporter): other_abstracts = medline.find_all("OtherAbstract") for other in other_abstracts: lang = "en" - if other.get('Language'): - lang = LANG_MAP_MARC.get(other['Language']) + if other.get("Language"): + lang = LANG_MAP_MARC.get(other["Language"]) abst = fatcat_openapi_client.ReleaseAbstract( content=other.AbstractText.get_text().strip(), mimetype="text/plain", @@ -572,15 +600,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.get_text().replace('\n', ' ') + given_name = author.ForeName.get_text().replace("\n", " ") if author.LastName: - surname = author.LastName.get_text().replace('\n', ' ') + surname = author.LastName.get_text().replace("\n", " ") if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): - raw_name = author.CollectiveName.get_text().replace('\n', ' ') + raw_name = author.CollectiveName.get_text().replace("\n", " ") contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -590,7 +618,7 @@ class PubmedImporter(EntityImporter): orcid = orcid.replace("http://orcid.org/", "") elif orcid.startswith("https://orcid.org/"): orcid = orcid.replace("https://orcid.org/", "") - elif '-' not in orcid: + elif "-" not in orcid: orcid = "{}-{}-{}-{}".format( orcid[0:4], orcid[4:8], @@ -598,27 +626,31 @@ class PubmedImporter(EntityImporter): orcid[12:16], ) creator_id = self.lookup_orcid(orcid) - contrib_extra['orcid'] = orcid + contrib_extra["orcid"] = orcid affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].get_text().replace('\n', ' ') + raw_affiliation = affiliations[0].get_text().replace("\n", " ") if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]] + contrib_extra["more_affiliations"] = [ + ra.get_text().replace("\n", " ") for ra in affiliations[1:] + ] if author.find("EqualContrib"): # TODO: schema for this? 
- contrib_extra['equal'] = True - contribs.append(fatcat_openapi_client.ReleaseContrib( - raw_name=raw_name, - given_name=given_name, - surname=surname, - role="author", - raw_affiliation=raw_affiliation, - creator_id=creator_id, - extra=contrib_extra, - )) - - if medline.AuthorList['CompleteYN'] == 'N': + contrib_extra["equal"] = True + contribs.append( + fatcat_openapi_client.ReleaseContrib( + raw_name=raw_name, + given_name=given_name, + surname=surname, + role="author", + raw_affiliation=raw_affiliation, + creator_id=creator_id, + extra=contrib_extra, + ) + ) + + if medline.AuthorList["CompleteYN"] == "N": contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al.")) for i, contrib in enumerate(contribs): @@ -633,7 +665,7 @@ class PubmedImporter(EntityImporter): # note that Reference always exists within a ReferenceList, but # that there may be multiple ReferenceList (eg, sometimes one per # Reference) - for ref in pubmed.find_all('Reference'): + for ref in pubmed.find_all("Reference"): ref_extra = dict() ref_doi = ref.find("ArticleId", IdType="doi") if ref_doi: @@ -643,22 +675,24 @@ class PubmedImporter(EntityImporter): ref_pmid = clean_pmid(ref_pmid.string) ref_release_id = None if ref_doi: - ref_extra['doi'] = ref_doi + ref_extra["doi"] = ref_doi if self.lookup_refs: ref_release_id = self.lookup_doi(ref_doi) if ref_pmid: - ref_extra['pmid'] = ref_pmid + ref_extra["pmid"] = ref_pmid if self.lookup_refs: ref_release_id = self.lookup_pmid(ref_pmid) ref_raw = ref.Citation if ref_raw: - ref_extra['unstructured'] = ref_raw.get_text() + ref_extra["unstructured"] = ref_raw.get_text() if not ref_extra: ref_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - target_release_id=ref_release_id, - extra=ref_extra, - )) + refs.append( + fatcat_openapi_client.ReleaseRef( + target_release_id=ref_release_id, + extra=ref_extra, + ) + ) if not refs: refs = None @@ -669,7 +703,7 @@ class PubmedImporter(EntityImporter): # group-title # pubmed: retraction refs if extra_pubmed: - extra['pubmed'] = extra_pubmed + extra["pubmed"] = extra_pubmed if not extra: extra = None @@ -690,14 +724,14 @@ class PubmedImporter(EntityImporter): doi=doi, pmid=pmid, pmcid=pmcid, - #isbn13 # never in Article + # isbn13 # never in Article ), volume=volume, issue=issue, pages=pages, - #publisher # not included? + # publisher # not included? language=language, - #license_slug # not in MEDLINE + # license_slug # not in MEDLINE abstracts=abstracts, contribs=contribs, refs=refs, @@ -725,21 +759,22 @@ class PubmedImporter(EntityImporter): raise err if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid: warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format( - existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid) + existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid + ) warnings.warn(warn_str) - self.counts['warn-pmid-doi-mismatch'] += 1 + self.counts["warn-pmid-doi-mismatch"] += 1 # don't clobber DOI, but do group together re.ext_ids.doi = None re.work_id = existing.work_id if existing and not self.do_updates: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): # TODO: any other reasons to do an update? 
# don't update if it already has PMID - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False elif existing: # but do update if only DOI was set @@ -750,12 +785,12 @@ class PubmedImporter(EntityImporter): existing.container_id = existing.container_id or re.container_id existing.refs = existing.refs or re.refs existing.abstracts = existing.abstracts or re.abstracts - existing.extra['pubmed'] = re.extra['pubmed'] + existing.extra["pubmed"] = re.extra["pubmed"] # fix stub titles if existing.title in [ - "OUP accepted manuscript", - ]: + "OUP accepted manuscript", + ]: existing.title = re.title existing.original_title = existing.original_title or re.original_title @@ -770,8 +805,8 @@ class PubmedImporter(EntityImporter): existing.language = existing.language or re.language # update subtitle in-place first - if not existing.subtitle and existing.extra.get('subtitle'): - subtitle = existing.extra.pop('subtitle') + if not existing.subtitle and existing.extra.get("subtitle"): + subtitle = existing.extra.pop("subtitle") if type(subtitle) == list: subtitle = subtitle[0] if subtitle: @@ -781,13 +816,13 @@ class PubmedImporter(EntityImporter): try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -797,11 +832,14 @@ class PubmedImporter(EntityImporter): return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): @@ -812,8 +850,9 @@ class PubmedImporter(EntityImporter): for article in soup.find_all("PubmedArticle"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) + -if __name__=='__main__': +if __name__ == "__main__": parser = PubmedImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 77205cee..78eeec7a 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid @@ -30,25 +29,25 @@ class ShadowLibraryImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Import of 'Shadow Library' file/release matches" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = 
eg_extra.get("agent", "fatcat_tools.ShadowLibraryImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") def want(self, raw_record): """ Only want to import records with complete file-level metadata """ - fm = raw_record['file_meta'] - if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']): - self.counts['skip-file-meta-incomplete'] += 1 + fm = raw_record["file_meta"] + if not (fm["mimetype"] and fm["md5hex"] and fm["sha256hex"] and fm["size_bytes"]): + self.counts["skip-file-meta-incomplete"] += 1 return False - if fm['mimetype'] != 'application/pdf': - self.counts['skip-not-pdf'] += 1 + if fm["mimetype"] != "application/pdf": + self.counts["skip-not-pdf"] += 1 return False return True @@ -57,23 +56,23 @@ class ShadowLibraryImporter(EntityImporter): We do the release lookup in this method. Try DOI, then PMID, last ISBN13. """ - shadow_corpus = obj['shadow']['shadow_corpus'] + shadow_corpus = obj["shadow"]["shadow_corpus"] assert shadow_corpus == shadow_corpus.strip().lower() - doi = clean_doi(obj['shadow'].get('doi')) - pmid = clean_pmid(obj['shadow'].get('pmid')) - isbn13 = clean_isbn13(obj['shadow'].get('isbn13')) - shadow_id = obj['shadow'].get('shadow_id').strip() + doi = clean_doi(obj["shadow"].get("doi")) + pmid = clean_pmid(obj["shadow"].get("pmid")) + isbn13 = clean_isbn13(obj["shadow"].get("isbn13")) + shadow_id = obj["shadow"].get("shadow_id").strip() assert shadow_id - extra = { '{}_id'.format(shadow_corpus): shadow_id } - for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + extra = {"{}_id".format(shadow_corpus): shadow_id} + for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]: if not ext_id: continue - extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id + extra["{}_{}".format(shadow_corpus, ext_type)] = ext_id # lookup release via several idents re = None - for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]: if not ext_id: continue try: @@ -86,29 +85,31 @@ class ShadowLibraryImporter(EntityImporter): break if not re: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None - release_ids = [re.ident,] + release_ids = [ + re.ident, + ] # parse single CDX into URLs (if exists) urls = [] - if obj.get('cdx'): - url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel) + if obj.get("cdx"): + url = make_rel_url(obj["cdx"]["url"], default_link_rel=self.default_link_rel) if url is not None: urls.append(url) wayback = "https://web.archive.org/web/{}/{}".format( - obj['cdx']['datetime'], - obj['cdx']['url']) + obj["cdx"]["datetime"], obj["cdx"]["url"] + ) urls.append(("webarchive", wayback)) urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] fe = fatcat_openapi_client.FileEntity( - md5=obj['file_meta']['md5hex'], - sha1=obj['file_meta']['sha1hex'], - sha256=obj['file_meta']['sha256hex'], - size=int(obj['file_meta']['size_bytes']), - mimetype=obj['file_meta']['mimetype'] or None, + md5=obj["file_meta"]["md5hex"], + sha1=obj["file_meta"]["sha1hex"], + sha256=obj["file_meta"]["sha256hex"], + size=int(obj["file_meta"]["size_bytes"]), + mimetype=obj["file_meta"]["mimetype"] or None, release_ids=release_ids, urls=urls, extra=dict(shadows=extra), @@ -130,45 +131,50 @@ class 
ShadowLibraryImporter(EntityImporter): if not existing.extra: existing.extra = {} - if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']: + if ( + existing.extra.get("shadows") + and list(fe.extra["shadows"].keys())[0] in existing.extra["shadows"] + ): # already imported from this shadow library; skip - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # check for edit conflicts if existing.ident in [e.ident for e in self._edits_inflight]: - self.counts['skip-update-inflight'] += 1 + self.counts["skip-update-inflight"] += 1 return False if fe.sha1 in [e.sha1 for e in self._edits_inflight]: raise Exception("Inflight insert; shouldn't happen") # minimum viable "existing" URL cleanup to fix dupes and broken links: # remove 'None' wayback URLs, and set archive.org rel 'archive' - existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + existing.urls = [ + u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url) + ] for i in range(len(existing.urls)): u = existing.urls[i] - if u.rel == 'repository' and '://archive.org/download/' in u.url: - existing.urls[i].rel = 'archive' - if u.rel == 'social': - u.rel = 'academicsocial' + if u.rel == "repository" and "://archive.org/download/" in u.url: + existing.urls[i].rel = "archive" + if u.rel == "social": + u.rel = "academicsocial" # merge the existing into this one and update merged_urls = {} for u in fe.urls + existing.urls: merged_urls[u.url] = u existing.urls = list(merged_urls.values()) - if not existing.extra.get('shadows'): - existing.extra['shadows'] = fe.extra['shadows'] + if not existing.extra.get("shadows"): + existing.extra["shadows"] = fe.extra["shadows"] else: - existing.extra['shadows'].update(fe.extra['shadows']) + existing.extra["shadows"].update(fe.extra["shadows"]) # do these "plus ones" because we really want to do these updates when possible if len(existing.urls) > SANE_MAX_URLS + 1: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) if len(existing.release_ids) > SANE_MAX_RELEASES + 1: - self.counts['skip-update-too-many-releases'] += 1 + self.counts["skip-update-too-many-releases"] += 1 return None existing.mimetype = existing.mimetype or fe.mimetype existing.size = existing.size or fe.size @@ -180,12 +186,15 @@ class ShadowLibraryImporter(EntityImporter): # group-level de-dupe edit.sha1 = existing.sha1 self._edits_inflight.append(edit) - self.counts['update'] += 1 + self.counts["update"] += 1 return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index 196f86ff..22fefad3 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -33,22 +33,23 @@ REQ_SESSION = requests.Session() def parse_wbm_url(url): """Takes a wayback machine URL, and returns a tuple: - (timestamp, datetime, original_url) + (timestamp, 
datetime, original_url) """ - chunks = url.split('/') + chunks = url.split("/") assert len(chunks) >= 6 - assert chunks[2] == 'web.archive.org' - assert chunks[3] == 'web' - return (chunks[4], - parse_wbm_timestamp(chunks[4]), - '/'.join(chunks[5:])) + assert chunks[2] == "web.archive.org" + assert chunks[3] == "web" + return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) + def test_parse_wbm_url(): u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" assert parse_wbm_url(u) == ( "20010712114837", datetime.datetime(2001, 7, 12, 11, 48, 37), - "http://www.dlib.org/dlib/june01/reich/06reich.html") + "http://www.dlib.org/dlib/june01/reich/06reich.html", + ) + def parse_wbm_timestamp(timestamp): """ @@ -56,7 +57,7 @@ def parse_wbm_timestamp(timestamp): python datetime object (UTC) """ # strip any "im_" or "id_" suffix - if timestamp.endswith('_'): + if timestamp.endswith("_"): timestamp = timestamp[:-3] # inflexible; require the full second-precision timestamp assert len(timestamp) == 14 @@ -66,11 +67,13 @@ def parse_wbm_timestamp(timestamp): day=int(timestamp[6:8]), hour=int(timestamp[8:10]), minute=int(timestamp[10:12]), - second=int(timestamp[12:14])) + second=int(timestamp[12:14]), + ) + def test_parse_wbm_timestamp(): - assert parse_wbm_timestamp("20010712114837") == \ - datetime.datetime(2001, 7, 12, 11, 48, 37) + assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) + def fetch_wbm(url): resp = REQ_SESSION.get(url) @@ -78,31 +81,35 @@ def fetch_wbm(url): assert resp.content return resp.content + def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): sys.stderr.write(embed_url + "\n") - assert embed_url.startswith('/web/') - embed_url = embed_url.split('/') + assert embed_url.startswith("/web/") + embed_url = embed_url.split("/") timestamp = embed_url[2] - if timestamp.endswith('_'): + if timestamp.endswith("_"): timestamp = timestamp[:-3] - url = '/'.join(embed_url[3:]) - #print((timestamp, url)) - resp = REQ_SESSION.get(CDX_API_BASE, params=dict( - url=url, - closest=timestamp, - sort="closest", - resolveRevisits="true", - matchType="exact", - limit=1, - )) + url = "/".join(embed_url[3:]) + # print((timestamp, url)) + resp = REQ_SESSION.get( + CDX_API_BASE, + params=dict( + url=url, + closest=timestamp, + sort="closest", + resolveRevisits="true", + matchType="exact", + limit=1, + ), + ) resp.raise_for_status() - #print(resp.url) + # print(resp.url) if resp.content: - hit = resp.content.decode('utf-8').split('\n')[0] + hit = resp.content.decode("utf-8").split("\n")[0] if cdx_output: cdx_output.write(hit + "\n") - cdx = hit.split(' ') - cdx = [x if (x and x != '-') else None for x in cdx] + cdx = hit.split(" ") + cdx = [x if (x and x != "-") else None for x in cdx] webcapture_cdx = WebcaptureCdxLine( surt=cdx[0], timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z", @@ -113,9 +120,9 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): sha256=None, ) if verify_hashes: - resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format( - cdx[1], # raw timestamp - webcapture_cdx.url)) + resp = REQ_SESSION.get( + GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp + ) resp.raise_for_status() assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() @@ -124,47 +131,50 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): else: return None + def 
wayback_url_to_relative(url): """ Wayback URLs can be relative or absolute in rewritten documents. This function converts any form of rewritten URL to a relative (to web.archive.org) one, or returns None if it isn't a rewritten URL at all. """ - if url.startswith('https://web.archive.org/'): + if url.startswith("https://web.archive.org/"): url = url[23:] - elif url.startswith('http://web.archive.org/'): + elif url.startswith("http://web.archive.org/"): url = url[22:] - if url.startswith('/web/'): + if url.startswith("/web/"): return url else: return None + def extract_embeds(soup): embeds = set() # <link href=""> - for tag in soup.find_all('link', href=True): - if tag['rel'] not in ('stylesheet',): + for tag in soup.find_all("link", href=True): + if tag["rel"] not in ("stylesheet",): continue - url = wayback_url_to_relative(tag['href']) + url = wayback_url_to_relative(tag["href"]) if url: embeds.add(url) # <img src=""> - for tag in soup.find_all('img', src=True): - url = wayback_url_to_relative(tag['src']) + for tag in soup.find_all("img", src=True): + url = wayback_url_to_relative(tag["src"]) if url: embeds.add(url) # <script src=""> - for tag in soup.find_all('script', src=True): - url = wayback_url_to_relative(tag['src']) + for tag in soup.find_all("script", src=True): + url = wayback_url_to_relative(tag["src"]) if url: embeds.add(url) return list(embeds) + def static_wayback_webcapture(wayback_url, cdx_output=None): """ Given a complete wayback machine capture URL, like: @@ -177,36 +187,40 @@ def static_wayback_webcapture(wayback_url, cdx_output=None): wbm_html = fetch_wbm(wayback_url) raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) - #with open(rewritten_path, 'r') as fp: + # with open(rewritten_path, 'r') as fp: # soup = BeautifulSoup(fp, "lxml") soup = BeautifulSoup(wbm_html, "lxml") embeds = extract_embeds(soup) - cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url), - cdx_output=cdx_output) + cdx_obj = lookup_cdx( + "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output + ) cdx_list = [cdx_obj] for url in embeds: cdx_obj = lookup_cdx(url, cdx_output=cdx_output) cdx_list.append(cdx_obj) - archive_urls = [WebcaptureUrl( - rel="wayback", - url="https://web.archive.org/web/", - )] + archive_urls = [ + WebcaptureUrl( + rel="wayback", + url="https://web.archive.org/web/", + ) + ] wc = WebcaptureEntity( cdx=cdx_list, timestamp=timestamp.isoformat() + "Z", original_url=original_url, archive_urls=archive_urls, - release_ids=None) + release_ids=None, + ) return wc + def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): """ Returns a tuple: (editgroup_id, edit). 
If failed, both are None """ raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) - git_rev = subprocess.check_output( - ["git", "describe", "--always"]).strip().decode('utf-8') + git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") release = api.get_release(release_id, expand="webcaptures") @@ -214,37 +228,44 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): for wc in release.webcaptures: if wc.original_url == original_url and wc.timestamp.date() == timestamp.date(): # skipping: already existed - print("release {} already had webcapture {} {}".format( - release_id, raw_timestamp, original_url)) + print( + "release {} already had webcapture {} {}".format( + release_id, raw_timestamp, original_url + ) + ) return (None, None) wc = static_wayback_webcapture(wayback_url) assert len(wc.cdx) >= 1 wc.release_ids = [release_id] if not editgroup_id: - eg = api.create_editgroup(Editgroup( - description="One-off import of static web content from wayback machine", - extra=dict( - git_rev=git_rev, - agent="fatcat_tools.auto_wayback_static"))) + eg = api.create_editgroup( + Editgroup( + description="One-off import of static web content from wayback machine", + extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"), + ) + ) editgroup_id = eg.editgroup_id edit = api.create_webcapture(eg.editgroup_id, wc) return (editgroup_id, edit) + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--verbose', - action='store_true', - help="verbose output") - parser.add_argument('wayback_url', - type=str, - help="URL of wayback capture to extract from") - parser.add_argument('--json-output', - type=argparse.FileType('w'), default=sys.stdout, - help="where to write out webcapture entity (as JSON)") - parser.add_argument('--cdx-output', - type=argparse.FileType('w'), default=None, - help="(optional) file to write out CDX stub") + parser.add_argument("--verbose", action="store_true", help="verbose output") + parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from") + parser.add_argument( + "--json-output", + type=argparse.FileType("w"), + default=sys.stdout, + help="where to write out webcapture entity (as JSON)", + ) + parser.add_argument( + "--cdx-output", + type=argparse.FileType("w"), + default=None, + help="(optional) file to write out CDX stub", + ) args = parser.parse_args() @@ -254,5 +275,6 @@ def main(): wc_dict = api_client.sanitize_for_serialization(wc) print(json.dumps(wc_dict)) -if __name__ == '__main__': + +if __name__ == "__main__": main() |