87 files changed, 5117 insertions, 1252 deletions
@@ -1,32 +1,24 @@ ## In Progress -- basic python tests for editgroup, annotation, submission changes -- python tests for new autoaccept behavior -- python tests for citation table storage efficiency changes - => should there be a distinction between empty list and no references? - yes, eg if expanded or not hidden - => postgres manual checks that this is working - => also benchmark (both speed and efficiency) +- check that any needed/new indices are in place + => seems to at least superficially work +- benchmark citation efficiency (in QA) + +- all query params need to be strings, and parse in rust :( + since=(datetime.datetime.utcnow() + datetime.timedelta(seconds=1)).isoformat()+"Z" +- doc: python client API needs to have booleans set as, eg, 'true'/'false' (str) (!?!?) + "note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response" ## Next Up - "don't clobber" mode/flag for crossref import (and others?) -- update_file requires 'id'. should it be 'ident'? - => something different about file vs. release -- guide updates for auth -- refactor webface views to use shared entity_view.html template -- handle 'wip' status entities in web UI - elastic inserter should handle deletions and redirects; if state isn't active, delete the document => don't delete, just store state. but need to "blank" redirects and WIP so they don't show up in results => refactor inserter to be a class (eg, for command line use) => end-to-end test of this behavior? -- date handling is really pretty bad for releases; mangling those Jan1/Dec31 - => elastic schema should have a year field (integer) -- document: elastic query date syntax is like: date:[2018-10-01 TO 2018-12-31] -- elastic transform should only include authors, not editors (?) - webcapture timestamp schema cleanup (both CDX and base) => dt.to_rfc3339_opts(SecondsFormat::Secs, true) => but this is mostly buried in serialization code? @@ -43,6 +35,9 @@ - handle very large author/reference lists (instead of dropping) => https://api.crossref.org/v1/works/http://dx.doi.org/10.1007/978-3-319-46095-6_7 => 7000+ authors (!) +- guide updates for auth +- refactor webface views to use shared entity_view.html template +- handle 'wip' status entities in web UI ## Bugs (or at least need tests) @@ -151,6 +146,7 @@ new importers: ## Schema / Entity Fields +- elastic transform should only include authors, not editors (?) - arxiv_id field (keep flip-flopping) - original_title field (internationalization, "original language") - `doi` field for containers (at least for "journal" type; maybe for "series" @@ -162,6 +158,7 @@ new importers: ## Other / Backburner +- document: elastic query date syntax is like: date:[2018-10-01 TO 2018-12-31] - fileset/webcapture webface anything - display abstracts better. no hashes or metadata; prefer plain or HTML, convert JATS if necessary diff --git a/extra/bootstrap_bots.sh b/extra/bootstrap_bots.sh new file mode 100755 index 00000000..7be148b2 --- /dev/null +++ b/extra/bootstrap_bots.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Run this script from the ../rust/ directory, only once. 
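+# Assumes the fatcat-auth binary has already been built (eg, with `cargo build`
+# in ../rust/), so that ./target/debug/fatcat-auth exists.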
+
+CMD_PATH="./target/debug"
+
+$CMD_PATH/fatcat-auth create-editor --admin --bot crossref-bot > /dev/null
+$CMD_PATH/fatcat-auth create-editor --admin --bot pubmed-bot > /dev/null
+$CMD_PATH/fatcat-auth create-editor --admin --bot datacite-bot > /dev/null
+$CMD_PATH/fatcat-auth create-editor --admin --bot orcid-bot > /dev/null
+$CMD_PATH/fatcat-auth create-editor --admin --bot journal-metadata-bot > /dev/null
+$CMD_PATH/fatcat-auth create-editor --admin --bot sandcrawler-bot > /dev/null
+
+echo -n "FATCAT_AUTH_WORKER_CROSSREF="
+$CMD_PATH/fatcat-auth create-token `$CMD_PATH/fatcat-auth list-editors | grep crossref-bot | cut -f1`
+echo -n "FATCAT_AUTH_WORKER_PUBMED="
+$CMD_PATH/fatcat-auth create-token `$CMD_PATH/fatcat-auth list-editors | grep pubmed-bot | cut -f1`
+echo -n "FATCAT_AUTH_WORKER_DATACITE="
+$CMD_PATH/fatcat-auth create-token `$CMD_PATH/fatcat-auth list-editors | grep datacite-bot | cut -f1`
+echo -n "FATCAT_AUTH_WORKER_ORCID="
+$CMD_PATH/fatcat-auth create-token `$CMD_PATH/fatcat-auth list-editors | grep orcid-bot | cut -f1`
+echo -n "FATCAT_AUTH_WORKER_JOURNAL_METADATA="
+$CMD_PATH/fatcat-auth create-token `$CMD_PATH/fatcat-auth list-editors | grep journal-metadata-bot | cut -f1`
+echo -n "FATCAT_AUTH_SANDCRAWLER="
+$CMD_PATH/fatcat-auth create-token `$CMD_PATH/fatcat-auth list-editors | grep sandcrawler-bot | cut -f1`
diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json
new file mode 100644
index 00000000..7a7ec90c
--- /dev/null
+++ b/extra/elasticsearch/changelog_schema.json
@@ -0,0 +1,39 @@
+{
+"settings": {
+    "index": {
+        "analysis": {
+            "analyzer": {
+                "default": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": [ "lowercase", "asciifolding" ]
+                }
+            }
+        }
+    }
+},
+"mappings": {
+    "changelog": {
+        "properties": {
+            "index": { "type": "integer" },
+            "editgroup_id": { "type": "keyword" },
+            "timestamp": { "type": "date" },
+            "username": { "type": "keyword" },
+            "is_bot": { "type": "boolean" },
+            "is_admin": { "type": "boolean" },
+            "agent": { "type": "keyword" },
+            "containers": { "type": "integer" },
+            "creators": { "type": "integer" },
+            "files": { "type": "integer" },
+            "filesets": { "type": "integer" },
+            "webcaptures": { "type": "integer" },
+            "releases": { "type": "integer" },
+            "works": { "type": "integer" },
+            "created": { "type": "integer" },
+            "updated": { "type": "integer" },
+            "deleted": { "type": "integer" },
+            "total": { "type": "integer" }
+        }
+    }
+}
+}
diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json
new file mode 100644
index 00000000..83791ab8
--- /dev/null
+++ b/extra/elasticsearch/container_schema.json
@@ -0,0 +1,74 @@
+{
+"settings": {
+    "index": {
+        "analysis": {
+            "analyzer": {
+                "default": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": [ "lowercase", "asciifolding" ]
+                },
+                "textIcu": {
+                    "type": "custom",
+                    "tokenizer": "icu_tokenizer",
+                    "char_filter": [ "icu_normalizer" ],
+                    "filter": [ "icu_folding" ]
+                },
+                "textIcuSearch": {
+                    "type": "custom",
+                    "tokenizer": "icu_tokenizer",
+                    "char_filter": [ "icu_normalizer" ],
+                    "filter": [ "icu_folding" ]
+                }
+            }
+        }
+    }
+},
+"mappings": {
+    "container": {
+        "properties": {
+            "ident": { "type": "keyword" },
+            "state": { "type": "keyword" },
+            "revision": { "type": "keyword" },
+            "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "publisher": { "type": "text", "index": true, "analyzer": "textIcu",
"search_analyzer":"textIcuSearch" }, + "container_type": { "type": "keyword" }, + "wikidata_qid": { "type": "keyword" }, + "issnl": { "type": "keyword" }, + "region": { "type": "keyword" }, + "nation": { "type": "keyword" }, + "discipline": { "type": "keyword" }, + "languages": { "type": "keyword" }, + "mimetypes": { "type": "keyword" }, + "first_year": { "type": "integer" }, + "last_year": { "type": "integer" }, + + "in_doaj": { "type": "boolean" }, + "in_road": { "type": "boolean" }, + "in_doi": { "type": "boolean" }, + "in_doaj_works": { "type": "boolean" }, + "in_sherpa_romeo":{ "type": "boolean" }, + "is_oa": { "type": "boolean" }, + "is_longtail_oa": { "type": "boolean" }, + "any_kbart": { "type": "boolean" }, + "any_jstor": { "type": "boolean" }, + "any_sim": { "type": "boolean" }, + "ia_homepage_status": { "type": "boolean" }, + + "releases_total": { "type": "integer" }, + "releases_kbart": { "type": "integer" }, + "releases_ia": { "type": "integer" }, + "releases_sim": { "type": "integer" }, + "releases_shadow": { "type": "integer" }, + "releases_any_file": { "type": "integer" }, + "releases_any_fileset": { "type": "integer" }, + "releases_any_webcapture": { "type": "integer" }, + + "year": { "type": "alias", "path": "first_year" }, + "type": { "type": "alias", "path": "container_type" }, + "oa": { "type": "alias", "path": "is_oa" }, + "longtail": { "type": "alias", "path": "is_longtail_oa" } + } + } +} +} diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index c9b77301..4cfe0894 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -28,41 +28,59 @@ "release": { "properties": { "ident": { "type": "keyword" }, + "state": { "type": "keyword" }, "revision": { "type": "keyword" }, "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "author": { "type": "alias", "path": "contrib_names" }, - "journal": { "type": "alias", "path": "container_name" }, - "date": { "type": "alias", "path": "release_date" }, - "year": { "type": "alias", "path": "release_year" }, - "issn": { "type": "alias", "path": "container_issnl" }, - "oa": { "type": "alias", "path": "container_is_oa" }, - "longtail": { "type": "alias", "path": "container_is_longtail_oa" }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "release_date": { "type": "date" }, "release_year": { "type": "integer" }, "release_type": { "type": "keyword" }, "release_status": { "type": "keyword" }, - "language": { "type": "keyword" }, - "doi": { "type": "keyword" }, - "pmid": { "type": "keyword" }, - "pmcid": { "type": "keyword" }, - "isbn13": { "type": "keyword" }, - "core_id": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, + "language": { "type": "keyword" }, + "doi": { "type": "keyword" }, + "pmid": { "type": "keyword" }, + "pmcid": { "type": "keyword" }, + "isbn13": { "type": "keyword" }, + "wikidata_qid": { "type": "keyword" }, + "core_id": { "type": "keyword" }, + "axiv_id": { "type": "keyword" }, + "jstor_id": { "type": "keyword" }, + "license": { "type": "keyword" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "container_issnl": { "type": "keyword" }, - "container_is_oa": { "type": "boolean" }, - "container_is_longtail_oa": { "type": "boolean" }, + 
"container_type": { "type": "keyword" }, "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "ref_count": { "type": "integer" }, "file_count": { "type": "integer" }, - "file_pdf_url": { "type": "keyword" }, - "file_in_webarchive": { "type": "boolean" }, - "file_in_ia": { "type": "boolean" }, + "fileset_count": { "type": "integer" }, + "webcapture_count": { "type": "integer" }, "any_abstract": { "type": "boolean" }, - "is_kept": { "type": "boolean" }, - "in_shadow": { "type": "boolean" } + + "best_pdf_url": { "type": "keyword" }, + "ia_pdf_url": { "type": "keyword" }, + "is_oa": { "type": "boolean" }, + "is_longtail_oa": { "type": "boolean" }, + "is_preserved": { "type": "boolean" }, + "in_kbart": { "type": "boolean" }, + "in_jstor": { "type": "boolean" }, + "in_dweb": { "type": "boolean" }, + "in_web": { "type": "boolean" }, + "in_ia": { "type": "boolean" }, + "in_sim": { "type": "boolean" }, + "in_shadows": { "type": "boolean" }, + + "author": { "type": "alias", "path": "contrib_names" }, + "journal": { "type": "alias", "path": "container_name" }, + "date": { "type": "alias", "path": "release_date" }, + "year": { "type": "alias", "path": "release_year" }, + "issn": { "type": "alias", "path": "container_issnl" }, + "oa": { "type": "alias", "path": "is_oa" }, + "longtail": { "type": "alias", "path": "is_longtail_oa" }, + "lang": { "type": "alias", "path": "language" }, + "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, + "is_kept": { "type": "alias", "path": "in_kbart" } } } } diff --git a/extra/journal_metadata/.gitignore b/extra/journal_metadata/.gitignore new file mode 100644 index 00000000..15dbcfda --- /dev/null +++ b/extra/journal_metadata/.gitignore @@ -0,0 +1,2 @@ +*.json +*.json.gz diff --git a/extra/journal_metadata/README.md b/extra/journal_metadata/README.md new file mode 100644 index 00000000..61dbc6b0 --- /dev/null +++ b/extra/journal_metadata/README.md @@ -0,0 +1,71 @@ + +This folder contains scripts to merge journal metadat from multiple sources and +provide a snapshot for bulk importing into fatcat. + +Specific bots will probably be needed to do continous updates; that's out of +scope for this first import. + + +## Sources + +The `./data/fetch.sh` script will fetch mirrored snapshots of all these +datasets. 
+ +A few sources of normalization/mappings: + +- ISSN-L (from ISSN org) + - Original: + - Snapshot: <https://archive.org/download/issn_issnl_mappings/20180216.ISSN-to-ISSN-L.txt> +- ISO 639-1 language codes: https://datahub.io/core/language-codes +- ISO 3166-1 alpha-2 nation codes + +In order of precedence (first higher than later): + +- NCBI Entrez (Pubmed) + - Original: <ftp://ftp.ncbi.nlm.nih.gov/pubmed/J_Entrez.txt> + - Snapshot: <https://archive.org/download/ncbi-entrez-2019/J_Entrez.txt> +- DOAJ + - Original: <https://doaj.org/csv> + - Snapshot: <https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv> +- ROAD + - Original: <http://road.issn.org/en/contenu/download-road-records> + - Snapshot: <https://archive.org/download/road-issn-2018/2018-01-24/export-issn.zip> +- SHERPA/ROMEO + - Original: <http://www.sherpa.ac.uk/downloads/journal-title-issn-urls.php> (requires reg) + - Mirror: <http://www.moreo.info/?csv=romeo-journals.csv> + - Snapshot: +- Norwegian Registry + - Original: <https://dbh.nsd.uib.no/publiseringskanaler/AlltidFerskListe> + - Snapshot: <https://archive.org/download/norwegian_register_journals> +- Wikidata (TODO: Journal-level not title-level) + - Original: <http://uri.gbv.de/wikicite/20180903/> + - Snapshot: <https://archive.org/download/wikicite-biblio-data-20180903> +- KBART reports: LOCKSS, CLOCKSS, Portico + - Original: (multiple, see README in IA item) + - Snapshot: <https://archive.org/download/keepers_reports_201901> +- JSTOR + - Original: <https://support.jstor.org/hc/en-us/articles/115007466248-JSTOR-title-lists> + - Snapshot: <KBART jstor_all-archive-titles.txt> +- Crossref title list (not DOIs) + - Original: <https://wwwold.crossref.org/titlelist/titleFile.csv> + - Snapshot: <https://archive.org/download/crossref_doi_titles> +- IA SIM Microfilm catalog + - Original: <https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.xlsx> +- IA homepage crawl attempts + +The SHERPA/ROMEO content comes from the list helpfully munged by moreo.info. + +General form here is to build a huge python dict in memory, keyed by the +ISSN-L, then write out to disk as JSON. Then the journal-metadata importer +takes a subset of fields and inserts to fatcat. 
Lastly, the elasticsearch +transformer takes a subset/combination of + +## Python Helpers/Libraries + +- ftfy +- pycountry + +Debian: + + sudo apt install python3-pycountry + sudo pip3 install ftfy diff --git a/extra/journal_metadata/data/.gitignore b/extra/journal_metadata/data/.gitignore new file mode 100644 index 00000000..c3f104a6 --- /dev/null +++ b/extra/journal_metadata/data/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!fetch.sh diff --git a/extra/journal_metadata/data/fetch.sh b/extra/journal_metadata/data/fetch.sh new file mode 100755 index 00000000..b087d864 --- /dev/null +++ b/extra/journal_metadata/data/fetch.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -eu + +#wget -c https://archive.org/download/road-issn-2018/2018-01-24/export-issn.zip -O road-2018-01-24-export-issn.zip +#unzip -n road-2018-01-24-export-issn.zip +wget -c https://archive.org/download/road-issn-2018/road-2018-01-24.tsv + +wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv + +wget -c https://archive.org/download/issn_issnl_mappings/20181203.ISSN-to-ISSN-L.txt + +wget -c https://archive.org/download/crossref_doi_titles/doi_titles_file_2019-01-24.csv + +#wget -c https://archive.org/download/ncbi-entrez-2019/J_Entrez.txt -O ncbi-entrez-2019.txt + +wget -c https://archive.org/download/moreo.info-2018-12-20/romeo-journals.csv +wget -c https://archive.org/download/moreo.info-2018-12-20/romeo-policies.csv +wget -c https://archive.org/download/moreo.info-2018-12-20/entrez-journals.csv + +wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv + +wget -c https://archive.org/download/keepers_reports_201901/jstor_all-archive-titles.txt +wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_AllCurrentJournalTitles_2019-01-07.txt +wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_EarlyJournalContent_2017-06-08.txt +wget -c https://archive.org/download/keepers_reports_201901/kbart_CLOCKSS.txt +wget -c https://archive.org/download/keepers_reports_201901/kbart_LOCKSS.txt +wget -c https://archive.org/download/keepers_reports_201901/Portico_Holding_KBart.txt + +wget -c https://archive.org/download/ia_journal_metadata_explore_2018-04-05/journal_homepage_results.partial.tsv + +#wget -c https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.xlsx +wget -c https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.converted.csv + +wget -c https://archive.org/download/norwegian_register_journals/2018-03-02%20Norwegian%20Register%20for%20Scientific%20Journals%20and%20Series.csv + +#wget -c https://archive.org/download/szczepanski-oa-journal-list-2018/Jan-Szczepanski-Open-Access-Journals-2018_0.docx +#wget -c https://archive.org/download/szczepanski-oa-journal-list-2018/Jan-Szczepanski-Open-Access-Journals-2018_0.converted.csv diff --git a/extra/journal_metadata/parse_merge_metadata.py b/extra/journal_metadata/parse_merge_metadata.py new file mode 100755 index 00000000..c9909a8e --- /dev/null +++ b/extra/journal_metadata/parse_merge_metadata.py @@ -0,0 +1,398 @@ +#!/usr/bin/env python3 + +import sys, csv, json +import ftfy +import pycountry + +ISSNL_FILE = 'data/20181203.ISSN-to-ISSN-L.txt' + +ENTREZ_FILE = 'data/entrez-journals.csv' +ROAD_FILE = 'data/road-2018-01-24.tsv' +ROAD_DATE = '2018-01-24' +DOAJ_FILE = 'data/doaj_20190124.csv' +DOAJ_DATE = '2019-01-24' +CROSSREF_FILE = 'data/doi_titles_file_2019-01-24.csv' +SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv' 
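+# romeo-policies.csv rows are joined onto romeo-journals.csv rows by the
+# 'RoMEO Record ID' column (see load_sherpa_romeo below)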
+SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv' +NORWEGIAN_FILE = 'data/2018-03-02 Norwegian Register for Scientific Journals and Series.csv' +NORWEGIAN_DATE = '2018-03-02' +LOCKSS_FILE = 'data/kbart_LOCKSS.txt' +CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt' +PORTICO_FILE = 'data/Portico_Holding_KBart.txt' +JSTOR_FILE = 'data/jstor_all-archive-titles.txt' +SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv' +IA_CRAWL_FILE = 'data/journal_homepage_results.partial.tsv' + + +class Munger(): + """ + Top-level fields we'd like to fill in if possible: + + issnp: string + issne: string + first_year: year (integer) + last_year: if publishing has stopped + languages: array of ISO codes; first is the "primary" language + nation: ISO shortcode of nation published from + url: homepage + abbrev: string + default_license: slug + original_name: native name (if name is translated) + platform: hosting platform: OJS, wordpress, scielo, etc + mimetypes: array of strings (eg, 'application/pdf', 'text/html') + aliases: array of "also known as" + + Lower priority (TODO/later): + coden: string + oclc_id: string (lookup?) + lccn_id: string (lookup?) + dblb_id: string + region: TODO: continent/world-region + discipline: TODO: highest-level subject; "life science", "humanities", etc + field: TODO: narrower description of field + subjects: TODO? + + TODO: more ftfy? + TODO: remove surrounding quotes + TODO: null ISSN-L? + TODO: sherpa OA: 'Paid OA options' or 'All journals OA' + TODO: mailto: in urls + TODO: empty gaps (sim) + """ + + def __init__(self): + self.data = dict() + with open(ISSNL_FILE, 'r') as f: + self.read_issn_map_file(f) + + def run(self, out_path): + self.load_road(ROAD_FILE) + self.load_doaj(DOAJ_FILE) + self.load_crossref(CROSSREF_FILE) + self.load_norwegian(NORWEGIAN_FILE) + self.load_sherpa_romeo(SHERPA_ROMEO_JOURNAL_FILE, SHERPA_ROMEO_POLICY_FILE) + self.load_kbart('lockss', LOCKSS_FILE) + self.load_kbart('clockss', CLOCKSS_FILE) + self.load_kbart('portico', PORTICO_FILE) + self.load_kbart('jstor', JSTOR_FILE) + self.load_entrez(ENTREZ_FILE) + self.load_sim(SIM_FILE) + self.load_homepage_crawl(IA_CRAWL_FILE) + self.summarize() + self.dump(out_path) + print("Done!") + + def dump(self, out_path): + print("#### Dumping to {}".format(out_path)) + with open(out_path, 'w') as out: + for issnl in self.data: + out.write(json.dumps(self.data[issnl]) + "\n") + + def summarize(self): + print("##### Loaded {} unique entries".format(len(self.data))) + + def read_issn_map_file(self, issn_map_file): + print("##### Loading ISSN map file...") + self._issn_issnl_map = dict() + for line in issn_map_file: + if line.startswith("ISSN") or len(line) == 0: + continue + (issn, issnl) = line.split()[0:2] + self._issn_issnl_map[issn] = issnl + # double mapping makes lookups easy + self._issn_issnl_map[issnl] = issnl + print("Got {} ISSN-L mappings.".format(len(self._issn_issnl_map))) + + def issn2issnl(self, issn): + if issn is None: + return None + return self._issn_issnl_map.get(issn) + + def add_issn(self, raw_issn=None, issne=None, issnp=None, name=None, publisher=None): + # do ISSN => ISSN-L mappings for any raw ISSNs + lookup = raw_issn or issne or issnp + lookup = lookup.strip() + if not (len(lookup) == 9 and lookup[4] == '-'): + print(lookup) + print(len(lookup)) + print(lookup[4]) + assert len(lookup) == 9 and lookup[4] == '-' + issnl = self.issn2issnl(lookup.upper()) + # lookup ISSN-Ls in data (or create one) + if not issnl in self.data: + self.data[issnl] = dict(issnl=issnl) + d = 
self.data[issnl] + # if name/publisher not set, do so + if name and not 'name' in d: + self.data[issnl]['name'] = ftfy.fix_text(name).strip() + if publisher and not 'publisher' in d: + self.data[issnl]['publisher'] = ftfy.fix_text(publisher).strip() + if issne and not 'issne' in d: + self.data[issnl]['issne'] = issne + if issnp and not 'issnp' in d: + self.data[issnl]['issnp'] = issnp + # always return ISSN-L + return issnl + + def load_entrez(self, path): + print("##### Loading Entrez...") + # JrId,JournalTitle,MedAbbr,"ISSN (Print)","ISSN (Online)",IsoAbbr,NlmId + reader = csv.DictReader(open(path)) + skipped = 0 + count = 0 + for row in reader: + if not (row.get('ISSN (Online)') or row.get('ISSN (Print)')): + skipped += 1 + continue + issnl = self.add_issn( + issne=row.get('ISSN (Online)'), + issnp=row.get('ISSN (Print)'), + name=row['JournalTitle'], + ) + count += 1 + print("Matched {}".format(count)) + print("Skipped {} for not having ISSNs".format(skipped)) + + def load_road(self, path): + print("##### Loading ROAD...") + reader = csv.DictReader(open(path), delimiter='\t', + fieldnames=("ISSN", "ISSN-L", "Short Title", "Title", "Publisher", "URL1", "URL2", "Region", "Lang1", "Lang2") + ) + count = 0 + for row in reader: + issnl = self.add_issn( + raw_issn=row['ISSN-L'], + name=row['Short Title'], + publisher=row['Publisher'], + ) + count += 1 + d = self.data[issnl] + if row['URL1'] and not 'url' in d: + self.data[issnl]['url'] = row['URL1'] + # TODO: region mapping: "Europe and North America" + # TODO: lang mapping: already alpha-3 + self.data[issnl]['road'] = dict(as_of=ROAD_DATE) + print("Matched {}".format(count)) + + def load_doaj(self, path): + print("##### Loading DOAJ...") + #Journal title Journal URL Alternative title ISSN-print ISSN-electronic Publisher Society or institution Platform, host or aggregator Country of publisher Journal article processing charges (APCs) ... Deposit policy directory Author holds copyright without restrictions Copyright information URL Author holds publishing rights without restrictions Publishing rights information URL DOAJ Seal Tick: Accepted after March 2014 Added on Date Subjects ISSN-L + reader = csv.DictReader(open(path)) + count = 0 + for row in reader: + issnl = self.add_issn( + issnp=row['Journal ISSN (print version)'], + issne=row['Journal EISSN (online version)'], + name=row['Journal title'], + publisher=row['Publisher'], + ) + count += 1 + d = self.data[issnl] + doaj = dict(as_of=DOAJ_DATE) + # TODO: work_level: bool (are work-level publications deposited with DOAJ?) + # TODO: archiving: array, can include 'library' or 'other' + + if row['Platform, host or aggregator']: + # TODO: mapping here? 
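+                # (eg, normalize raw values to slugs like 'ojs', 'wordpress',
+                # or 'scielo', per the 'platform' field in the class docstring)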
+ self.data[issnl]['platform'] = row['Platform, host or aggregator'] + if row['DOAJ Seal']: + doaj['seal'] = {"no": False, "yes": True}[row['DOAJ Seal'].lower()] + if row['Country of publisher']: + # TODO: country mapping + self.data[issnl]['country'] = row['Country of publisher'] + # TODO: Subjects + self.data[issnl]['doaj'] = doaj + print("Matched {}".format(count)) + + def load_sherpa_romeo(self, journal_path, policy_path): + # first load policies + print("##### Loading SHERPA/ROMEO policies...") + #RoMEO Record ID,Publisher,Policy Heading,Country,RoMEO colour,Published Permission,Published Restrictions,Published Max embargo,Accepted Prmission,Accepted Restrictions,Accepted Max embargo,Submitted Permission,Submitted Restrictions,Submitted Max embargo,Open Access Publishing,Record Status,Updated + policies = dict() + fixed_policy_file = ftfy.fix_file(open(policy_path, 'rb')) + policy_reader = csv.DictReader(fixed_policy_file) + for row in policy_reader: + policies[row['RoMEO Record ID']] = row + print("##### Loading SHERPA/ROMEO journal metadata...") + #Journal Title,ISSN,ESSN,URL,RoMEO Record ID,Updated + # super mangled :( + raw_file = open(journal_path, 'rb').read().decode(errors='replace') + fixed_file = ftfy.fix_text(raw_file) + reader = csv.DictReader(fixed_file.split('\n')) + count = 0 + for row in reader: + #row['Journal Title'] = row.pop('\ufeffJournal Title') + row.update(policies[row['RoMEO Record ID']]) + issnl = self.add_issn( + issnp=row['ISSN'], + issne=row['ESSN'], + name=row['Journal Title'], + publisher=row['Publisher'], + ) + count += 1 + d = self.data[issnl] + sherpa_romeo = dict() + if row['RoMEO colour']: + sherpa_romeo['color'] = row['RoMEO colour'] + if row['Open Access Publishing']: + # TODO: boolean? + sherpa_romeo['oa'] = row['Open Access Publishing'] + if row['Country'] and not 'country' in d: + self.data[issnl]['country'] = row['Country'].lower() + self.data[issnl]['sherpa_romeo'] = sherpa_romeo + print("Matched {}".format(count)) + + def load_norwegian(self, path): + print("##### Loading Norwegian Registry...") + #pandas.read_csv(NORWEGIAN_FILE, sep=';', encoding="ISO-8859-1") + #NSD tidsskrift_id;Original title;International title;Present Level (2018);Print ISSN;Online ISSN;Open Access;NPI Scientific Field;NPI Academic Discipline;URL;Publishing Company;Publisher;Country of publication;Language;Level 2019;Level 2018;Level 2017;Level 2016;Level 2015;Level 2014;Level 2013;Level 2012;Level 2011;Level 2010;Level 2009;Level 2008;Level 2007;Level 2006;Level 2005;Level 2004;itar_id + reader = csv.DictReader(open(path, encoding="ISO-8859-1"), delimiter=";") + count = 0 + skip = 0 + for row in reader: + issnp = row['Print ISSN'] + issne = row['Online ISSN'] + if issne and len(issne.strip()) != 9: + issne = None + if issnp and len(issnp.strip()) != 9: + issnp = None + if not (issnp or issne): + skip += 1 + continue + issnl = self.add_issn( + issnp=issnp, + issne=issne, + name=row['International title'], + publisher=row['Publisher'], + ) + count += 1 + d = self.data[issnl] + norwegian = dict(as_of=NORWEGIAN_DATE) + norwegian['level'] = int(row['Present Level (2018)']) + norwegian['id'] = int(row['NSD tidsskrift_id']) + + if row['Original title'] != row['International title'] and not 'original_name' in d: + self.data[issnl]['original_name'] = row['Original title'] + if row['Country of publication'] and not 'country' in d: + # TODO: country mapping + self.data[issnl]['country'] = row['Country of publication'] + if row['Language'] and not 'language' in d: + # TODO: 
language mapping + self.data[issnl]['language'] = row['Language'] + self.data[issnl]['norwegian'] = norwegian + print("Skipped {} for mangled ISSN".format(skip)) + print("Matched {}".format(count)) + + def load_kbart(self, name, path): + print("##### Loading KBART file for {}...".format(name)) + #publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name + raw_file = open(path, 'rb').read().decode(errors='replace') + fixed_file = ftfy.fix_text(raw_file) + reader = csv.DictReader(fixed_file.split('\n'), delimiter='\t') + count = 0 + skip = 0 + for row in reader: + if not row['print_identifier'] and not row['online_identifier']: + skip += 1 + continue + issnl = self.add_issn( + issnp=row['print_identifier'], + issne=row['online_identifier'], + name=row['publication_title'], + publisher=row['publisher_name'], + ) + count += 1 + d = self.data[issnl] + if not 'kbart' in d: + self.data[issnl]['kbart'] = dict() + kbart = dict() + if row['date_first_issue_online'] and row['date_last_issue_online']: + kbart['year_span'] = [[int(row['date_first_issue_online'][:4]), int(row['date_last_issue_online'][:4])]] + self.data[issnl]['kbart'][name] = kbart + print("Skipped {} missing ISSN".format(skip)) + print("Matched {}".format(count)) + + def load_crossref(self, path): + print("##### Loading Crossref...") + #"JournalTitle","JournalID","Publisher","pissn","eissn","additionalIssns","doi","(year1)[volume1]issue1,issue2,issue3(year2)[volume2]issue4,issues5" + reader = csv.DictReader(open(path)) + count = 0 + skip = 0 + for row in reader: + if row['pissn'] and len(row['pissn']) == 8: + row['pissn'] = row['pissn'][:4] + '-' + row['pissn'][4:] + if row['eissn'] and len(row['eissn']) == 8: + row['eissn'] = row['eissn'][:4] + '-' + row['eissn'][4:] + if not (row['pissn'] or row['eissn']): + skip += 1 + continue + issnl = self.add_issn( + issnp=row['pissn'], + issne=row['eissn'], + name=row['JournalTitle'], + publisher=row['Publisher'], + ) + count += 1 + d = self.data[issnl] + crossref = dict() + if row['doi']: + crossref['doi'] = row['doi'] + self.data[issnl]['crossref'] = crossref + print("Skipped {} missing ISSN".format(skip)) + print("Matched {}".format(count)) + + def load_sim(self, path): + print("##### Loading SIM Metadata...") + #NA Pub Cat ID,Title,Publisher,ISSN,Impact Rank,Total Cities,Journal Impact Factor,Eigenfact or Score,First Volume,Last Volume,NA Gaps,"Scholarly / Peer-\n Reviewed","Peer-\n Reviewed",Pub Type,Pub Language,Subjects + reader = csv.DictReader(open(path)) + count = 0 + skip = 0 + for row in reader: + if not row['ISSN'] or row['ISSN'] == "NULL": + skip += 1 + continue + issnl = self.add_issn( + raw_issn=row['ISSN'][:9], + name=row['Title'], + publisher=row['Publisher'], + ) + count += 1 + d = self.data[issnl] + sim = dict() + sim['id'] = row['NA Pub Cat ID'] + sim['first_year'] = row['First Volume'] + sim['last_year'] = row['Last Volume'] + sim['gaps'] = row['NA Gaps'] + # TODO: 'Pub Language' + # TODO: 'Pub Type' + self.data[issnl]['sim'] = sim + print("Skipped {} missing ISSN".format(skip)) + print("Matched {}".format(count)) + + def load_homepage_crawl(self, path): + print("##### Loading IA Homepage Crawl Results...") + reader = csv.DictReader(open(path), delimiter='\t', + fieldnames=("ISSN", "first_url", "first_status", "last_status", "last_url") + ) + count = 0 + 
skip = 0 + for row in reader: + issnl = self.add_issn( + raw_issn=row['ISSN'], + ) + count += 1 + d = self.data[issnl] + ia = d.get('ia', dict()) + ia['homepage_status'] = int(row['last_status']) + if ia['homepage_status'] == 200: + ia['homepage_url'] = row['last_url'] + else: + ia['homepage_url'] = row['first_url'] + self.data[issnl]['ia'] = ia + print("Skipped {} missing ISSN".format(skip)) + print("Matched {}".format(count)) + +if __name__=='__main__': + munger = Munger() + munger.run(sys.argv[1]) + diff --git a/fatcat-openapi2.yml b/fatcat-openapi2.yml index 140fbde3..e359de36 100644 --- a/fatcat-openapi2.yml +++ b/fatcat-openapi2.yml @@ -26,6 +26,10 @@ tags: # TAGLINE descriptions: "Creator entities: such as authors" # TAGLINE - name: files # TAGLINE descriptions: "File entities" # TAGLINE + - name: filesets # TAGLINE + descriptions: "Fileset entities" # TAGLINE + - name: webcaptures # TAGLINE + descriptions: "Webcapture entities" # TAGLINE - name: releases # TAGLINE descriptions: "Release entities: individual articles, pre-prints, books" # TAGLINE - name: works # TAGLINE @@ -132,6 +136,9 @@ definitions: type: string example: "Journal of Important Results" description: "Required for valid entities" + container_type: + type: string + description: "Eg, 'journal'" publisher: type: string example: "Society of Curious Students" @@ -139,10 +146,6 @@ definitions: <<: *FATCATISSN wikidata_qid: type: string - abbrev: - type: string - coden: - type: string creator_entity: type: object # required for creation: display_name @@ -249,7 +252,7 @@ definitions: properties: <<: *ENTITYPROPS cdx: - # limit of 200 CDX lines, at least to start + # limit of 200 CDX lines, at least to start? type: array items: type: object @@ -264,7 +267,9 @@ definitions: example: "org,asheesh)/apus/ch1/node15.html" timestamp: type: string - example: "20020429162520" + format: date-time + example: "2016-09-19T17:20:24Z" + description: "UTC, 'Z'-terminated, second (or better) precision" url: type: string # NOTE: not format:url to allow alternatives @@ -302,6 +307,7 @@ definitions: timestamp: type: string format: date-time + description: "same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Can be the earliest or average of CDX timestamps if that makes sense." release_ids: type: array items: @@ -313,7 +319,10 @@ definitions: <<: *ENTITYPROPS title: type: string - description: "Required for valid entities" + description: "Required for valid entities. The title used in citations and for display; usually English" + original_title: + type: string + description: "Title in original language (or, the language of the full text of this release)" work_id: type: string example: "q3nouwy3nnbsvo3h5klxsx4a7y" @@ -343,7 +352,7 @@ definitions: example: "book" release_status: type: string - example: "preprint" + example: "preprint, retracted" release_date: type: string format: date @@ -367,6 +376,10 @@ definitions: core_id: type: string #format: custom + arxiv_id: + type: string + jstor_id: + type: string volume: type: string issue: @@ -379,6 +392,9 @@ definitions: language: description: "Two-letter RFC1766/ISO639-1 language code, with extensions" type: string + license_slug: + type: string + description: "Short version of license name. 
Eg, 'CC-BY'" contribs: type: array items: @@ -587,11 +603,14 @@ definitions: description: "Optional; GET-only" raw_name: type: string + role: + type: string + raw_affiliation: + type: string + description: "Raw affiliation string as displayed in text" extra: type: object additionalProperties: {} - role: - type: string auth_oidc: type: object required: @@ -686,6 +705,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -943,6 +970,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -1223,6 +1258,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -1485,6 +1528,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -1713,6 +1764,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -1941,6 +2000,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -2194,6 +2261,14 @@ paths: in: query type: string required: false + - name: arxiv_id + in: query + type: string + required: false + - name: jstor_id + in: query + type: string + required: false - name: expand in: query type: string @@ -2284,6 +2359,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true diff --git a/notes/bootstrap/import_timing_20190116.txt b/notes/bootstrap/import_timing_20190116.txt new file mode 100644 index 00000000..96723ca8 --- /dev/null +++ b/notes/bootstrap/import_timing_20190116.txt @@ -0,0 +1,492 @@ + +## master / eea40c6c713a35e19eb005a322b2075018a32e3e + + sudo service fatcat-api stop + # as postgres user: diesel database reset + sudo service elasticsearch stop + sudo service postgresql restart + sudo service fatcat-api start + # reset postgres stats + + time ./fatcat_import.py issn /srv/fatcat/datasets/journal_extra_metadata.csv + + Processed 53300 lines, inserted 53283, updated 0. 
+ real 1m10.618s + user 0m8.812s + sys 0m0.316s + + + time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py orcid - + + Processed 48900 lines, inserted 48731, updated 0. (times 80x) + real 17m16.242s + user 26m45.464s + sys 1m37.052s + + ~300 TPS + 35% fatcatd CPU + bunch of python3 around 25-30% CPU. overall only ~12% CPU. + all disk write. autovacuum in progress (why? is this a naive importer?) + + time zcat /srv/fatcat/datasets/crossref-works.2018-09-05.1mil.json.gz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + + ~200% fatcatd CPU; 75% parallel; then postgres; then python3 + overall 40% user, 6% system + 10808 TPS (!) + autovacuum on release_rev + TODO: logs are going to rsyslog and disk (causing disk contention) + + Processed 50450 lines, inserted 46166, updated 0. (unsure how many chunks) + 905112inputs+2125192outputs (9major+2694248minor)pagefaults 0swaps + real 5m54.368s + user 33m1.724s + sys 1m52.404s + + + # did not run: + #time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata - + + +Results: + + table_name | table_size | indexes_size | total_size + --------------------------------------------------------------+------------+--------------+------------ + "public"."creator_rev" | 371 MB | 457 MB | 827 MB + "public"."creator_edit" | 377 MB | 420 MB | 797 MB + "public"."creator_ident" | 255 MB | 412 MB | 667 MB + "public"."release_rev" | 382 MB | 261 MB | 643 MB + "public"."release_ref" | 437 MB | 152 MB | 589 MB + "public"."release_contrib" | 154 MB | 141 MB | 295 MB + "public"."release_edit" | 89 MB | 99 MB | 188 MB + "public"."work_edit" | 89 MB | 98 MB | 187 MB + "public"."release_ident" | 60 MB | 97 MB | 157 MB + "public"."work_ident" | 60 MB | 97 MB | 157 MB + "public"."work_rev" | 39 MB | 36 MB | 75 MB + "public"."container_rev" | 16 MB | 6040 kB | 22 MB + "public"."editgroup" | 8520 kB | 14 MB | 22 MB + "public"."abstracts" | 19 MB | 1648 kB | 21 MB + "public"."changelog" | 6632 kB | 6600 kB | 13 MB + "public"."container_edit" | 5824 kB | 6208 kB | 12 MB + "public"."container_ident" | 4160 kB | 6376 kB | 10 MB + "public"."release_rev_abstract" | 2448 kB | 3000 kB | 5448 kB + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan + ----------------------+--------------+----------------+-----------+----------+---------- + release_ref | 6 | Missing Index? | 457768960 | 6 | 0 + release_rev_abstract | 3 | Missing Index? | 2473984 | 3 | 0 + release_contrib | 3 | Missing Index? 
| 161882112 | 3 | 0 + release_edit | -14934 | OK | 93069312 | 2 | 14936 + work_edit | -14936 | OK | 93003776 | 2 | 14938 + container_edit | -14977 | OK | 5931008 | 2 | 14979 + creator_edit | -14977 | OK | 395403264 | 2 | 14979 + abstracts | -39370 | OK | 19357696 | 1 | 39371 + changelog | -108251 | OK | 6766592 | 627 | 108878 + container_rev | -179447 | OK | 17104896 | 3 | 179450 + container_ident | -590029 | OK | 4235264 | 3 | 590032 + release_ident | -929184 | OK | 62881792 | 5331 | 934515 + work_rev | -1837772 | OK | 40828928 | 1 | 1837773 + work_ident | -1845253 | OK | 62873600 | 6980 | 1852233 + creator_ident | -3930575 | OK | 267001856 | 3 | 3930578 + editgroup | -5848807 | OK | 8691712 | 31878 | 5880685 + release_rev | -6081392 | OK | 400916480 | 9 | 6081401 + creator_rev | -7818340 | OK | 388743168 | 3 | 7818343 + + select count(*) from release_ref; + 1701210 + + Size: 4.61G + + +## citation-efficiency / 8a0d963beb2fa6766a7141df39dc322abea1b9a0 + + sudo service fatcat-api stop + # as postgres user: diesel database reset + sudo service elasticsearch stop + sudo service postgresql restart + sudo service fatcat-api start + + + time ./fatcat_import.py issn /srv/fatcat/datasets/journal_extra_metadata.csv + + Processed 53300 lines, inserted 53283, updated 0. + + real 1m9.867s + user 0m8.580s + sys 0m0.292s + + + time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py orcid - + + real 17m20.110s + user 26m56.660s + sys 1m37.280s + + + time zcat /srv/fatcat/datasets/crossref-works.2018-09-05.1mil.json.gz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + + 12043 TPS + 225% fatcatd, 33% total usr CPU, 5% sys + autovacuum in progress + + 1916.81user 110.41system 5:23.08elapsed 627%CPU (0avgtext+0avgdata 411056maxresident)k + 47920inputs+2021736outputs (4major+2734186minor)pagefaults 0swaps + real 5m23.095s + user 32m10.964s + sys 1m51.800s + + table_name | table_size | indexes_size | total_size + --------------------------------------------------------------+------------+--------------+------------ + "public"."creator_rev" | 371 MB | 456 MB | 827 MB + "public"."creator_edit" | 377 MB | 421 MB | 798 MB + "public"."creator_ident" | 255 MB | 412 MB | 667 MB + "public"."release_rev" | 385 MB | 260 MB | 645 MB + "public"."release_contrib" | 154 MB | 141 MB | 295 MB + "public"."refs_blob" | 197 MB | 6064 kB | 203 MB + "public"."release_edit" | 89 MB | 98 MB | 187 MB + "public"."work_edit" | 89 MB | 98 MB | 187 MB + "public"."work_ident" | 60 MB | 97 MB | 157 MB + "public"."release_ident" | 60 MB | 97 MB | 156 MB + "public"."work_rev" | 39 MB | 36 MB | 75 MB + "public"."container_rev" | 16 MB | 6056 kB | 22 MB + "public"."editgroup" | 8472 kB | 13 MB | 22 MB + "public"."abstracts" | 19 MB | 1672 kB | 21 MB + "public"."changelog" | 6544 kB | 6592 kB | 13 MB + "public"."container_edit" | 5832 kB | 6096 kB | 12 MB + "public"."container_ident" | 4120 kB | 6352 kB | 10 MB + "public"."release_rev_abstract" | 2432 kB | 3040 kB | 5472 kB + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan + ----------------------+--------------+----------------+-----------+----------+---------- + release_rev_abstract | 3 | Missing Index? | 2457600 | 3 | 0 + release_contrib | 3 | Missing Index? 
| 161800192 | 3 | 0 + release_edit | -14348 | OK | 93044736 | 2 | 14350 + work_edit | -14419 | OK | 93044736 | 2 | 14421 + creator_edit | -14977 | OK | 395321344 | 2 | 14979 + container_edit | -14977 | OK | 5939200 | 2 | 14979 + abstracts | -39364 | OK | 19365888 | 1 | 39365 + changelog | -109498 | OK | 6676480 | 2 | 109500 + refs_blob | -141113 | OK | 50651136 | 1 | 141114 + container_rev | -179441 | OK | 17096704 | 3 | 179444 + container_ident | -589642 | OK | 4194304 | 3 | 589645 + release_ident | -929175 | OK | 62840832 | 3548 | 932723 + work_rev | -1836990 | OK | 40787968 | 1 | 1836991 + work_ident | -1844214 | OK | 62849024 | 4071 | 1848285 + creator_ident | -3930572 | OK | 267010048 | 3 | 3930575 + release_rev | -4378297 | OK | 403906560 | 9 | 4378306 + editgroup | -5911871 | OK | 8642560 | 3 | 5911874 + creator_rev | -7818337 | OK | 388710400 | 3 | 7818340 + + select count(*) from release_ref; + 7 + + select count(*) from refs_blob; + 70169 + + Size: 4.23G + +Total row size is more than halved, and index by almost two orders of +magnitude, with about the same (or even faster) insert time. Success! + +Raw release JSON (crossref-works.2018-09-05.1mil.json) was first million lines +(not randomized). 217 MB gzip, about 2.1 GB uncompressed. + +Continuing with a full import... + + time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + + 217758.13user 10299.22system 19:56:34elapsed 317%CPU (0avgtext+0avgdata 437384maxresident)k + 88774752inputs+356480424outputs (573520major+35809039minor)pagefaults 0swaps + + real 1196m34.828s => 20 hours + user 3713m5.376s + sys 177m48.364s + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+------+-------------+----------+----------- + release_edit | -89909 | OK | 8641216512 | 2 | 89911 + work_edit | -89980 | OK | 8641134592 | 2 | 89982 + creator_edit | -90540 | OK | 395321344 | 8 | 90548 + container_edit | -90544 | OK | 9494528 | 4 | 90548 + release_contrib | -918913 | OK | 22936829952 | 3 | 918916 + release_rev_abstract | -919036 | OK | 271998976 | 3 | 919039 + container_rev | -1029775 | OK | 21168128 | 3 | 1029778 + changelog | -1941085 | OK | 117219328 | 2 | 1941087 + abstracts | -4633441 | OK | 2443132928 | 1 | 4633442 + creator_ident | -7165562 | OK | 267010048 | 3 | 7165565 + creator_rev | -9432011 | OK | 388710400 | 3 | 9432014 + refs_blob | -34911929 | OK | 12710191104 | 2 | 34911931 + container_ident | -66613383 | OK | 7626752 | 3 | 66613386 + release_ident | -86429880 | OK | 5833670656 | 3548 | 86433428 + work_rev | -170840559 | OK | 3784466432 | 1 | 170840560 + work_ident | -170923520 | OK | 5833400320 | 4071 | 170927591 + editgroup | -176784137 | OK | 149798912 | 3 | 176784140 + release_rev | -478829274 | OK | 43008122880 | 9 | 478829283 + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_rev" | 40 GB | 23 GB | 63 GB + "public"."refs_blob" | 54 GB | 1479 MB | 56 GB + "public"."release_contrib" | 21 GB | 19 GB | 40 GB + "public"."release_edit" | 8243 MB | 9154 MB | 17 GB + "public"."work_edit" | 8243 MB | 9150 MB | 17 GB + "public"."release_ident" | 5565 MB | 9017 MB | 14 GB + "public"."work_ident" | 5565 MB | 9006 MB | 14 GB + "public"."work_rev" | 3610 
MB | 3343 MB | 6953 MB + "public"."abstracts" | 2382 MB | 191 MB | 2573 MB + "public"."creator_rev" | 371 MB | 456 MB | 827 MB + "public"."creator_edit" | 377 MB | 421 MB | 798 MB + "public"."creator_ident" | 255 MB | 412 MB | 667 MB + "public"."release_rev_abstract" | 259 MB | 336 MB | 596 MB + "public"."editgroup" | 143 MB | 237 MB | 380 MB + "public"."changelog" | 112 MB | 116 MB | 228 MB + "public"."container_rev" | 20 MB | 9248 kB | 29 MB + "public"."container_edit" | 9304 kB | 8872 kB | 18 MB + "public"."container_ident" | 7472 kB | 9112 kB | 16 MB + + Size: 234.86G + + + time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates - + + Processed 531750 lines, inserted 455063, updated 0. + Command exited with non-zero status 1 + 12904.53user 485.22system 2:10:40elapsed 170%CPU (0avgtext+0avgdata 63300maxresident)k + 98696inputs+3452984outputs (154major+287476minor)pagefaults 0swaps + + real 130m40.181s + user 215m34.456s + sys 8m10.064s + + + time zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - + + Processed 486350 lines, inserted 247482, updated 171906. + 20591.62user 844.63system 3:21:40elapsed 177%CPU (0avgtext+0avgdata 41488maxresident)k + 80768inputs+2522808outputs (110major+251307minor)pagefaults 0swaps + + real 201m40.885s + user 343m34.616s + sys 14m8.364s + + + time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata - + + Processed 133303 lines, inserted 123905, updated 0. + 10636.02user 424.26system 2:20:14elapsed 131%CPU (0avgtext+0avgdata 65232maxresident)k + 23984inputs+20285752outputs (119major+1344072minor)pagefaults 0swaps + + real 140m14.397s + user 180m0.592s + sys 7m30.664s + +Re-summarize: + + select count(*) from file_rev_release; + 12510846 + + select count(*) from container_ident where is_live='f'; + 0 + select count(*) from release_ident where is_live='f'; + 0 + select count(*) from work_ident where is_live='f'; + 0 + select count(*) from creator_ident where is_live='f'; + 1 + select count(*) from file_ident where is_live='f'; + 0 + + Size: 261.59G + => releases and works about 242 GB of this + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_rev" | 40 GB | 24 GB | 64 GB + "public"."refs_blob" | 58 GB | 1576 MB | 60 GB + "public"."release_contrib" | 22 GB | 19 GB | 41 GB + "public"."release_edit" | 8386 MB | 9300 MB | 17 GB + "public"."work_edit" | 8386 MB | 9297 MB | 17 GB + "public"."release_ident" | 5690 MB | 9199 MB | 15 GB + "public"."work_ident" | 5690 MB | 9191 MB | 15 GB + "public"."file_rev_url" | 6151 MB | 2346 MB | 8496 MB + "public"."work_rev" | 3673 MB | 3402 MB | 7075 MB + "public"."abstracts" | 3807 MB | 277 MB | 4085 MB + "public"."file_rev" | 1403 MB | 2309 MB | 3712 MB + "public"."file_edit" | 1181 MB | 1236 MB | 2417 MB + "public"."file_rev_release" | 721 MB | 1266 MB | 1987 MB + "public"."file_ident" | 691 MB | 1163 MB | 1854 MB + "public"."release_rev_abstract" | 374 MB | 495 MB | 869 MB + "public"."creator_rev" | 371 MB | 456 MB | 827 MB + "public"."creator_edit" | 377 MB | 421 MB | 798 MB + "public"."creator_ident" | 255 MB | 412 MB | 667 MB + 
"public"."editgroup" | 194 MB | 299 MB | 493 MB + "public"."changelog" | 134 MB | 138 MB | 272 MB + "public"."container_rev" | 20 MB | 9248 kB | 29 MB + "public"."container_edit" | 9304 kB | 8872 kB | 18 MB + "public"."container_ident" | 7472 kB | 9112 kB | 16 MB + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+------+-------------+----------+----------- + release_edit | -486322 | OK | 8791547904 | 14 | 486336 + work_edit | -486391 | OK | 8791498752 | 8 | 486399 + file_edit | -486945 | OK | 1237671936 | 4 | 486949 + creator_edit | -486949 | OK | 395321344 | 8 | 486957 + container_edit | -486953 | OK | 9494528 | 4 | 486957 + container_rev | -1029946 | OK | 21168128 | 3 | 1029949 + file_rev_url | -2166783 | OK | 6448095232 | 2 | 2166785 + file_rev_release | -2166811 | OK | 756015104 | 7 | 2166818 + changelog | -7336464 | OK | 140369920 | 2 | 7336466 + abstracts | -7447647 | OK | 3716759552 | 1 | 7447648 + creator_ident | -7561970 | OK | 267010048 | 6 | 7561976 + creator_rev | -9432017 | OK | 388710400 | 3 | 9432020 + release_contrib | -11915853 | OK | 23410876416 | 3 | 11915856 + release_rev_abstract | -11917411 | OK | 392249344 | 3 | 11917414 + file_ident | -23530866 | OK | 724213760 | 60366 | 23591232 + refs_blob | -40651974 | OK | 13605445632 | 2 | 40651976 + container_ident | -67010119 | OK | 7626752 | 5 | 67010124 + file_rev | -84478325 | OK | 1470947328 | 10 | 84478335 + release_ident | -114803381 | OK | 5964980224 | 3551 | 114806932 + work_rev | -173810916 | OK | 3850354688 | 1 | 173810917 + work_ident | -177260615 | OK | 5964554240 | 4074 | 177264689 + editgroup | -192178637 | OK | 203137024 | 3 | 192178640 + release_rev | -501596237 | OK | 43460804608 | 9 | 501596246 +(23 rows) + + + bnewbold@wbgrp-svc500$ df -h / + Filesystem Size Used Avail Use% Mounted on + /dev/vda1 858G 409G 407G 51% / + + bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/11/main/ + 263G /var/lib/postgresql/11/main/ + +At this point slightly contaminated by running re-import for a minute before cancelling... + + select count(*) from refs_blob; + 18,517,091 + + select count(*) from release_ident; + 86,905,233 + +Re-importing crossref dump to simulate double size database: + + time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + + Processed 4989650 lines, inserted 538846, updated 0. 
(etc) + 208873.41user 8805.46system 12:40:59elapsed 476%CPU (0avgtext+0avgdata 419868maxresident)k + 43945800inputs+268799736outputs (210727major+21027251minor)pagefaults 0swaps + + real 760m59.324s + user 3567m3.524s + sys 152m56.692s + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_rev" | 47 GB | 27 GB | 74 GB + "public"."refs_blob" | 72 GB | 1898 MB | 73 GB + "public"."release_contrib" | 26 GB | 23 GB | 49 GB + "public"."release_edit" | 9418 MB | 10 GB | 19 GB + "public"."work_edit" | 9418 MB | 10 GB | 19 GB + "public"."work_ident" | 6386 MB | 10 GB | 16 GB + "public"."release_ident" | 6387 MB | 10 GB | 16 GB + "public"."file_rev_url" | 6151 MB | 2346 MB | 8496 MB + "public"."work_rev" | 4125 MB | 3828 MB | 7952 MB + "public"."abstracts" | 4134 MB | 303 MB | 4437 MB + "public"."file_rev" | 1403 MB | 2309 MB | 3712 MB + "public"."file_edit" | 1181 MB | 1236 MB | 2417 MB + "public"."file_rev_release" | 721 MB | 1266 MB | 1987 MB + "public"."file_ident" | 691 MB | 1163 MB | 1854 MB + "public"."release_rev_abstract" | 409 MB | 539 MB | 948 MB + "public"."editgroup" | 340 MB | 543 MB | 883 MB + "public"."creator_rev" | 371 MB | 456 MB | 827 MB + "public"."creator_edit" | 377 MB | 421 MB | 798 MB + "public"."creator_ident" | 255 MB | 412 MB | 667 MB + "public"."changelog" | 250 MB | 259 MB | 508 MB + "public"."container_rev" | 20 MB | 9272 kB | 29 MB + "public"."container_edit" | 9472 kB | 8880 kB | 18 MB + "public"."container_ident" | 7592 kB | 9136 kB | 16 MB + + Size: 301.96G + + select count(*) from refs_blob; + 22,322,741 + + select count(*) from release_ident; + 97,597,519 + + changelog: 4,286,296 + +Huh. Expected this to basically double size... what happened? Doing fetches? + +So... it was doing fetches (no 'no_release_updates' flag passed), but still +inserted 5 million? also not good. + + time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 --no-release-updates + + Processed 5001450 lines, inserted 4811233, updated 0. 
+ 138166.58user 5605.61system 43:47:54elapsed 91%CPU (0avgtext+0avgdata 426964maxresident)k + 208379288inputs+488675440outputs (267864major+49077696minor)pagefaults 0swaps + + real 2627m55.003s + user 2386m11.476s + sys 99m0.408s + + table_name | table_size | indexes_size | total_size + --------------------------------------------------------------+------------+--------------+------------ + "public"."release_rev" | 93 GB | 53 GB | 146 GB + "public"."release_contrib" | 52 GB | 45 GB | 97 GB + "public"."refs_blob" | 72 GB | 1898 MB | 73 GB + "public"."release_edit" | 18 GB | 20 GB | 39 GB + "public"."work_edit" | 18 GB | 20 GB | 39 GB + "public"."work_ident" | 12 GB | 20 GB | 32 GB + "public"."release_ident" | 12 GB | 20 GB | 32 GB + "public"."work_rev" | 8185 MB | 7596 MB | 15 GB + "public"."file_rev_url" | 6151 MB | 2346 MB | 8496 MB + "public"."abstracts" | 4134 MB | 303 MB | 4437 MB + "public"."file_rev" | 1403 MB | 2309 MB | 3712 MB + "public"."file_edit" | 1181 MB | 1236 MB | 2417 MB + "public"."file_rev_release" | 721 MB | 1266 MB | 1987 MB + "public"."file_ident" | 691 MB | 1163 MB | 1854 MB + "public"."release_rev_abstract" | 700 MB | 919 MB | 1619 MB + "public"."editgroup" | 486 MB | 788 MB | 1275 MB + "public"."creator_rev" | 371 MB | 456 MB | 827 MB + "public"."creator_edit" | 377 MB | 421 MB | 798 MB + "public"."changelog" | 365 MB | 381 MB | 746 MB + "public"."creator_ident" | 255 MB | 412 MB | 667 MB + "public"."container_rev" | 20 MB | 9272 kB | 29 MB + "public"."container_edit" | 9472 kB | 8880 kB | 18 MB + "public"."container_ident" | 7592 kB | 9136 kB | 16 MB + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan + ----------------------+--------------+------+--------------+----------+------------ + release_edit | -487544 | OK | 19594010624 | 26 | 487570 + work_edit | -487615 | OK | 19594043392 | 26 | 487641 + file_edit | -488168 | OK | 1237671936 | 19 | 488187 + creator_edit | -488173 | OK | 395321344 | 26 | 488199 + container_edit | -488306 | OK | 9666560 | 49 | 488355 + file_rev_url | -2166808 | OK | 6448095232 | 2 | 2166810 + file_rev_release | -2166881 | OK | 756015104 | 7 | 2166888 + container_rev | -2264841 | OK | 21364736 | 3 | 2264844 + changelog | -11338986 | OK | 382525440 | 2 | 11338988 + creator_rev | -12726261 | OK | 388710400 | 3 | 12726264 + creator_ident | -14563891 | OK | 267010048 | 6 | 14563897 + abstracts | -15594992 | OK | 4052975616 | 1 | 15594993 + file_ident | -23532116 | OK | 724213760 | 60366 | 23592482 + file_rev | -84478438 | OK | 1470947328 | 10 | 84478448 + release_contrib | -97501069 | OK | 55310950400 | 3 | 97501072 + release_rev_abstract | -97505413 | OK | 734248960 | 3 | 97505416 + refs_blob | -108179066 | OK | 15747162112 | 11 | 108179077 + container_ident | -152392399 | OK | 7749632 | 5 | 152392404 + release_ident | -307197678 | OK | 13256884224 | 3557 | 307201235 + work_rev | -387420683 | OK | 8580505600 | 1 | 387420684 + work_ident | -390871805 | OK | 13256515584 | 4074 | 390875879 + editgroup | -409831715 | OK | 509853696 | 3 | 409831718 + release_rev | -1112440989 | OK | 100107378688 | 9 | 1112440998 + + Size: 501.37G + + select count(*) from refs_blob; + 22,322,742 + + select count(*) from release_ident; + 193,709,943 + diff --git a/notes/schema/alignments.txt b/notes/schema/alignments.txt index e7678d93..7fc37606 100644 --- a/notes/schema/alignments.txt +++ b/notes/schema/alignments.txt @@ -27,9 +27,25 @@ Specifically, the "variables" and type definitions: <http://docs.citationstyles. 
 - rights/license (for explicit OA)
 - version (eg, for software, standards)
 - url (eg, for blog posts and other web content; canonical only)
+- authority (for things like patents)
+- collection_title (for book series)
+- short_title
+- edition (eg, "4th")
+- event (eg, conference)
+- chapter_number
+- submitted
+
+"extra" for citations:
+- most of the above, or any fields from 'release'
+- authors (an array)
+- url
+- issue, volume, date, edition
+- accessed_date
+
+release_date aligns with... 'issued'? not original-date
+pages aligns with 'page'. Should this be 'locator'?
 
 other things:
-- align cite-items even closer with CSL? assuming this is what crossref is doing
 - anything specially needed for a blog post? url (original/canonical)?
 - press_release
diff --git a/notes/schema/mag_schema_comparison.txt b/notes/schema/mag_schema_comparison.txt
new file mode 100644
index 00000000..0328ff7e
--- /dev/null
+++ b/notes/schema/mag_schema_comparison.txt
@@ -0,0 +1,65 @@
+
+Looking at the Microsoft Academic Graph schema: https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema
+
+My take-aways from this are:
+
+- should allow storing raw affiliations today in release_contrib rows, and some
+  day have a foreign key to institution there
+- maybe should have an "original_title" field for releases? though could go in
+  'extra' (along with subtitle)
+- have a well-known 'extra' key to use for saving citation context in references (see the sketch below)
+
+
+## Data Model (high-level)
+
+Includes rich affiliation (at the per-paper level) and "field of study"
+tagging.
+
+No work/release distinction.
+
+There are URLs, but no file-level metadata.
+
+Doesn't store full abstracts, for legal reasons.
+
+
+## Details (lower-level)
+
+Across many entities, there are "normalized" and "display" names.
+
+Some stats are aggregated: paper and citation counts.
+
+#### Affiliations
+
+Institution names: "normalized" vs. "display"
+
+"GRID" id?
+
+What is the WikiPage? Wikipedia?
+
+#### Authors
+
+Saves "last known" affiliation.
+
+#### Field of Study
+
+Nested hierarchy.
+
+#### Citations
+
+"Context" table stores... presumably the text surrounding the citation itself.
+
+"References" table stores little metadata about the citation itself.
+
+#### Papers
+
+Paper URLs now have types (an int).
+
+"Paper Title" / "Original Title" / "Book Title"
+
+Year and Date stored separately (same as fatcat).
+
+Stores first and last page separately.
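+
+Rough sketch of how the affiliation and citation-context take-aways at the top
+of this file could map onto fatcat's Python client (field names match the
+importers in this repo; the values and the 'citation_context' key are only
+illustrative suggestions):
+
+    import fatcat_client
+
+    # raw affiliation stored per-contrib now; an institution ident/foreign key
+    # could be added alongside it later
+    contrib = fatcat_client.ReleaseContrib(
+        raw_name="Jane Doe",
+        raw_affiliation="Example University",
+        role="author",
+    )
+
+    # citation context kept under a well-known key in the reference 'extra'
+    ref = fatcat_client.ReleaseRef(
+        key="ref12",
+        extra={'citation_context': "...as shown previously [12]..."},
+    )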
+ +"Original Venue" (string), presumably name of the container/journal + +Has arbitrary resources (URLs) diff --git a/python/Pipfile b/python/Pipfile index eebdab36..b04bb91a 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -32,6 +32,7 @@ python-dateutil = "*" sickle = "*" python-snappy = "*" pymacaroons = "*" +ftfy= "*" [requires] # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 296079f0..f2d39a99 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "c3deb49cf4c122c2aed3f4f944e9763cfcf40c85891ca3d3e9cabc3debbb9075" + "sha256": "8f98bb3f6a3083c8b03cb68d1ee48b25449a950dd8a9d15189f2eb4fae48f760" }, "pipfile-spec": 6, "requires": { @@ -96,27 +96,27 @@ }, "cryptography": { "hashes": [ - "sha256:05a6052c6a9f17ff78ba78f8e6eb1d777d25db3b763343a1ae89a7a8670386dd", - "sha256:0eb83a24c650a36f68e31a6d0a70f7ad9c358fa2506dc7b683398b92e354a038", - "sha256:0ff4a3d6ea86aa0c9e06e92a9f986de7ee8231f36c4da1b31c61a7e692ef3378", - "sha256:1699f3e916981df32afdd014fb3164db28cdb61c757029f502cb0a8c29b2fdb3", - "sha256:1b1f136d74f411f587b07c076149c4436a169dc19532e587460d9ced24adcc13", - "sha256:21e63dd20f5e5455e8b34179ac43d95b3fb1ffa54d071fd2ed5d67da82cfe6dc", - "sha256:2454ada8209bbde97065453a6ca488884bbb263e623d35ba183821317a58b46f", - "sha256:3cdc5f7ca057b2214ce4569e01b0f368b3de9d8ee01887557755ccd1c15d9427", - "sha256:418e7a5ec02a7056d3a4f0c0e7ea81df374205f25f4720bb0e84189aa5fd2515", - "sha256:471a097076a7c4ab85561d7fa9a1239bd2ae1f9fd0047520f13d8b340bf3210b", - "sha256:5ecaf9e7db3ca582c6de6229525d35db8a4e59dc3e8a40a331674ed90e658cbf", - "sha256:63b064a074f8dc61be81449796e2c3f4e308b6eba04a241a5c9f2d05e882c681", - "sha256:6afe324dfe6074822ccd56d80420df750e19ac30a4e56c925746c735cf22ae8b", - "sha256:70596e90398574b77929cd87e1ac6e43edd0e29ba01e1365fed9c26bde295aa5", - "sha256:70c2b04e905d3f72e2ba12c58a590817128dfca08949173faa19a42c824efa0b", - "sha256:8908f1db90be48b060888e9c96a0dee9d842765ce9594ff6a23da61086116bb6", - "sha256:af12dfc9874ac27ebe57fc28c8df0e8afa11f2a1025566476b0d50cdb8884f70", - "sha256:b4fc04326b2d259ddd59ed8ea20405d2e695486ab4c5e1e49b025c484845206e", - "sha256:da5b5dda4aa0d5e2b758cc8dfc67f8d4212e88ea9caad5f61ba132f948bab859" - ], - "version": "==2.4.2" + "sha256:05b3ded5e88747d28ee3ef493f2b92cbb947c1e45cf98cfef22e6d38bb67d4af", + "sha256:06826e7f72d1770e186e9c90e76b4f84d90cdb917b47ff88d8dc59a7b10e2b1e", + "sha256:08b753df3672b7066e74376f42ce8fc4683e4fd1358d34c80f502e939ee944d2", + "sha256:2cd29bd1911782baaee890544c653bb03ec7d95ebeb144d714b0f5c33deb55c7", + "sha256:31e5637e9036d966824edaa91bf0aa39dc6f525a1c599f39fd5c50340264e079", + "sha256:42fad67d7072216a49e34f923d8cbda9edacbf6633b19a79655e88a1b4857063", + "sha256:4946b67235b9d2ea7d31307be9d5ad5959d6c4a8f98f900157b47abddf698401", + "sha256:522fdb2809603ee97a4d0ef2f8d617bc791eb483313ba307cb9c0a773e5e5695", + "sha256:6f841c7272645dd7c65b07b7108adfa8af0aaea57f27b7f59e01d41f75444c85", + "sha256:7d335e35306af5b9bc0560ca39f740dfc8def72749645e193dd35be11fb323b3", + "sha256:8504661ffe324837f5c4607347eeee4cf0fcad689163c6e9c8d3b18cf1f4a4ad", + "sha256:9260b201ce584d7825d900c88700aa0bd6b40d4ebac7b213857bd2babee9dbca", + "sha256:9a30384cc402eac099210ab9b8801b2ae21e591831253883decdb4513b77a3cd", + "sha256:9e29af877c29338f0cab5f049ccc8bd3ead289a557f144376c4fbc7d1b98914f", + "sha256:ab50da871bc109b2d9389259aac269dd1b7c7413ee02d06fe4e486ed26882159", + 
"sha256:b13c80b877e73bcb6f012813c6f4a9334fcf4b0e96681c5a15dac578f2eedfa0", + "sha256:bfe66b577a7118e05b04141f0f1ed0959552d45672aa7ecb3d91e319d846001e", + "sha256:e091bd424567efa4b9d94287a952597c05d22155a13716bf5f9f746b9dc906d3", + "sha256:fa2b38c8519c5a3aa6e2b4e1cf1a549b54acda6adb25397ff542068e73d1ed00" + ], + "version": "==2.5" }, "fatcat-client": { "editable": true, @@ -152,6 +152,14 @@ "index": "pypi", "version": "==0.2" }, + "ftfy": { + "hashes": [ + "sha256:84a1614190173bb447ac9d581e50185c6aa35b538754b6bedaba0cc0f83d8e80", + "sha256:fa74757fb7cb444366fa6a79c2feabd40281a44dfbf6eaed492a804764ee26b2" + ], + "index": "pypi", + "version": "==5.5.1" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", @@ -366,6 +374,13 @@ ], "version": "==1.24.1" }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + }, "werkzeug": { "hashes": [ "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", @@ -558,10 +573,10 @@ }, "parso": { "hashes": [ - "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", - "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" + "sha256:4b8f9ed80c3a4a3191aa3261505d868aa552dd25649cb13a7d73b6b7315edf2d", + "sha256:5a120be2e8863993b597f1c0437efca799e90e0793c98ae5d4e34ebd00140e31" ], - "version": "==0.3.1" + "version": "==0.3.2" }, "pathlib2": { "hashes": [ @@ -595,10 +610,10 @@ }, "pluggy": { "hashes": [ - "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095", - "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f" + "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616", + "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a" ], - "version": "==0.8.0" + "version": "==0.8.1" }, "prompt-toolkit": { "hashes": [ @@ -610,38 +625,38 @@ }, "psycopg2": { "hashes": [ - "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e", - "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237", - "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f", - "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300", - "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076", - "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221", - "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82", - "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92", - "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3", - "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6", - "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e", - "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae", - "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535", - "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f", - "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26", - "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7", - "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53", - "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b", - "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc", - 
"sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee", - "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20", - "sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d", - "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9", - "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244", - "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f", - "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b", - "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370", - "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9", - "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d", - "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0" - ], - "version": "==2.7.6.1" + "sha256:02445ebbb3a11a3fe8202c413d5e6faf38bb75b4e336203ee144ca2c46529f94", + "sha256:0e9873e60f98f0c52339abf8f0339d1e22bfe5aae0bcf7aabd40c055175035ec", + "sha256:1148a5eb29073280bf9057c7fc45468592c1bb75a28f6df1591adb93c8cb63d0", + "sha256:259a8324e109d4922b0fcd046e223e289830e2568d6f4132a3702439e5fd532b", + "sha256:28dffa9ed4595429e61bacac41d3f9671bb613d1442ff43bcbec63d4f73ed5e8", + "sha256:314a74302d4737a3865d40ea50e430ce1543c921ba10f39d562e807cfe2edf2a", + "sha256:36b60201b6d215d7658a71493fdf6bd5e60ad9a0cffed39906627ff9f4f3afd3", + "sha256:3f9d532bce54c4234161176ff3b8688ff337575ca441ea27597e112dfcd0ee0c", + "sha256:5d222983847b40af989ad96c07fc3f07e47925e463baa5de716be8f805b41d9b", + "sha256:6757a6d2fc58f7d8f5d471ad180a0bd7b4dd3c7d681f051504fbea7ae29c8d6f", + "sha256:6a0e0f1e74edb0ab57d89680e59e7bfefad2bfbdf7c80eb38304d897d43674bb", + "sha256:6ca703ccdf734e886a1cf53eb702261110f6a8b0ed74bcad15f1399f74d3f189", + "sha256:8513b953d8f443c446aa79a4cc8a898bd415fc5e29349054f03a7d696d495542", + "sha256:9262a5ce2038570cb81b4d6413720484cb1bc52c064b2f36228d735b1f98b794", + "sha256:97441f851d862a0c844d981cbee7ee62566c322ebb3d68f86d66aa99d483985b", + "sha256:a07feade155eb8e69b54dd6774cf6acf2d936660c61d8123b8b6b1f9247b67d6", + "sha256:a9b9c02c91b1e3ec1f1886b2d0a90a0ea07cc529cb7e6e472b556bc20ce658f3", + "sha256:ae88216f94728d691b945983140bf40d51a1ff6c7fe57def93949bf9339ed54a", + "sha256:b360ffd17659491f1a6ad7c928350e229c7b7bd83a2b922b6ee541245c7a776f", + "sha256:b4221957ceccf14b2abdabef42d806e791350be10e21b260d7c9ce49012cc19e", + "sha256:b90758e49d5e6b152a460d10b92f8a6ccf318fcc0ee814dcf53f3a6fc5328789", + "sha256:c669ea986190ed05fb289d0c100cc88064351f2b85177cbfd3564c4f4847d18c", + "sha256:d1b61999d15c79cf7f4f7cc9021477aef35277fc52452cf50fd13b713c84424d", + "sha256:de7bb043d1adaaf46e38d47e7a5f703bb3dab01376111e522b07d25e1a79c1e1", + "sha256:e393568e288d884b94d263f2669215197840d097c7e5b0acd1a51c1ea7d1aba8", + "sha256:ed7e0849337bd37d89f2c2b0216a0de863399ee5d363d31b1e5330a99044737b", + "sha256:f153f71c3164665d269a5d03c7fa76ba675c7a8de9dc09a4e2c2cdc9936a7b41", + "sha256:f1fb5a8427af099beb7f65093cbdb52e021b8e6dbdfaf020402a623f4181baf5", + "sha256:f36b333e9f86a2fba960c72b90c34be6ca71819e300f7b1fc3d2b0f0b2c546cd", + "sha256:f4526d078aedd5187d0508aa5f9a01eae6a48a470ed678406da94b4cd6524b7e" + ], + "version": "==2.7.7" }, "ptyprocess": { "hashes": [ @@ -674,11 +689,11 @@ }, "pytest": { "hashes": [ - "sha256:3e65a22eb0d4f1bdbc1eacccf4a3198bf8d4049dea5112d70a0c61b00e748d02", - "sha256:5924060b374f62608a078494b909d341720a050b5224ff87e17e12377486a71d" + "sha256:41568ea7ecb4a68d7f63837cf65b92ce8d0105e43196ff2b26622995bb3dc4b2", + 
"sha256:c3c573a29d7c9547fb90217ece8a8843aa0c1328a797e200290dc3d0b4b823be" ], "index": "pypi", - "version": "==4.1.0" + "version": "==4.1.1" }, "pytest-cov": { "hashes": [ @@ -727,30 +742,30 @@ }, "typed-ast": { "hashes": [ - "sha256:0555eca1671ebe09eb5f2176723826f6f44cca5060502fea259de9b0e893ab53", - "sha256:0ca96128ea66163aea13911c9b4b661cb345eb729a20be15c034271360fc7474", - "sha256:16ccd06d614cf81b96de42a37679af12526ea25a208bce3da2d9226f44563868", - "sha256:1e21ae7b49a3f744958ffad1737dfbdb43e1137503ccc59f4e32c4ac33b0bd1c", - "sha256:37670c6fd857b5eb68aa5d193e14098354783b5138de482afa401cc2644f5a7f", - "sha256:46d84c8e3806619ece595aaf4f37743083f9454c9ea68a517f1daa05126daf1d", - "sha256:5b972bbb3819ece283a67358103cc6671da3646397b06e7acea558444daf54b2", - "sha256:6306ffa64922a7b58ee2e8d6f207813460ca5a90213b4a400c2e730375049246", - "sha256:6cb25dc95078931ecbd6cbcc4178d1b8ae8f2b513ae9c3bd0b7f81c2191db4c6", - "sha256:7e19d439fee23620dea6468d85bfe529b873dace39b7e5b0c82c7099681f8a22", - "sha256:7f5cd83af6b3ca9757e1127d852f497d11c7b09b4716c355acfbebf783d028da", - "sha256:81e885a713e06faeef37223a5b1167615db87f947ecc73f815b9d1bbd6b585be", - "sha256:94af325c9fe354019a29f9016277c547ad5d8a2d98a02806f27a7436b2da6735", - "sha256:b1e5445c6075f509d5764b84ce641a1535748801253b97f3b7ea9d948a22853a", - "sha256:cb061a959fec9a514d243831c514b51ccb940b58a5ce572a4e209810f2507dcf", - "sha256:cc8d0b703d573cbabe0d51c9d68ab68df42a81409e4ed6af45a04a95484b96a5", - "sha256:da0afa955865920edb146926455ec49da20965389982f91e926389666f5cf86a", - "sha256:dc76738331d61818ce0b90647aedde17bbba3d3f9e969d83c1d9087b4f978862", - "sha256:e7ec9a1445d27dbd0446568035f7106fa899a36f55e52ade28020f7b3845180d", - "sha256:f741ba03feb480061ab91a465d1a3ed2d40b52822ada5b4017770dfcb88f839f", - "sha256:fe800a58547dd424cd286b7270b967b5b3316b993d86453ede184a17b5a6b17d" + "sha256:023625bfa9359e29bd6e24cac2a4503495b49761d48a5f1e38333fc4ac4d93fe", + "sha256:07591f7a5fdff50e2e566c4c1e9df545c75d21e27d98d18cb405727ed0ef329c", + "sha256:153e526b0f4ffbfada72d0bb5ffe8574ba02803d2f3a9c605c8cf99dfedd72a2", + "sha256:3ad2bdcd46a4a1518d7376e9f5016d17718a9ed3c6a3f09203d832f6c165de4a", + "sha256:3ea98c84df53ada97ee1c5159bb3bc784bd734231235a1ede14c8ae0775049f7", + "sha256:51a7141ccd076fa561af107cfb7a8b6d06a008d92451a1ac7e73149d18e9a827", + "sha256:52c93cd10e6c24e7ac97e8615da9f224fd75c61770515cb323316c30830ddb33", + "sha256:6344c84baeda3d7b33e157f0b292e4dd53d05ddb57a63f738178c01cac4635c9", + "sha256:64699ca1b3bd5070bdeb043e6d43bc1d0cebe08008548f4a6bee782b0ecce032", + "sha256:74903f2e56bbffe29282ef8a5487d207d10be0f8513b41aff787d954a4cf91c9", + "sha256:7891710dba83c29ee2bd51ecaa82f60f6bede40271af781110c08be134207bf2", + "sha256:91976c56224e26c256a0de0f76d2004ab885a29423737684b4f7ebdd2f46dde2", + "sha256:9bad678a576ecc71f25eba9f1e3fd8d01c28c12a2834850b458428b3e855f062", + "sha256:b4726339a4c180a8b6ad9d8b50d2b6dc247e1b79b38fe2290549c98e82e4fd15", + "sha256:ba36f6aa3f8933edf94ea35826daf92cbb3ec248b89eccdc053d4a815d285357", + "sha256:bbc96bde544fd19e9ef168e4dfa5c3dfe704bfa78128fa76f361d64d6b0f731a", + "sha256:c0c927f1e44469056f7f2dada266c79b577da378bbde3f6d2ada726d131e4824", + "sha256:c0f9a3708008aa59f560fa1bd22385e05b79b8e38e0721a15a8402b089243442", + "sha256:f0bf6f36ff9c5643004171f11d2fdc745aa3953c5aacf2536a0685db9ceb3fb1", + "sha256:f5be39a0146be663cbf210a4d95c3c58b2d7df7b043c9047c5448e358f0550a2", + "sha256:fcd198bf19d9213e5cbf2cde2b9ef20a9856e716f76f9476157f90ae6de06cc6" ], "markers": "python_version < '3.7' and implementation_name == 'cpython'", - "version": "==1.1.1" 
+ "version": "==1.2.0" }, "urllib3": { "hashes": [ @@ -768,9 +783,9 @@ }, "wrapt": { "hashes": [ - "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" + "sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533" ], - "version": "==1.10.11" + "version": "==1.11.1" } } } diff --git a/python/README_import.md b/python/README_import.md index cc9a94e1..2465940b 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -26,11 +26,13 @@ the others: wget https://archive.org/download/ia_papers_manifest_2018-01-25/index/idents_files_urls.sqlite.gz wget https://archive.org/download/ia_journal_metadata_explore_2018-04-05/journal_extra_metadata.csv wget https://archive.org/download/issn_issnl_mappings/20180216.ISSN-to-ISSN-L.txt - wget https://archive.org/download/orcid-dump-2017/public_profiles_API-2.0_2017_10_json.tar.gz + wget https://archive.org/download/orcid-dump-2017/public_profiles_1_2_json.all.json.gz wget https://archive.org/download/ia_journal_pid_map_munge_20180908/release_ids.ia_munge_20180908.sqlite3.gz wget https://archive.org/download/ia_test_paper_matches/2018-08-27-2352.17-matchcrossref.insertable.json.gz wget https://archive.org/download/ia_papers_manifest_2018-01-25_matched/ia_papers_manifest_2018-01-25.matched.json.gz + gunzip public_profiles_1_2_json.all.json.gz + ## ISSN From CSV file: @@ -54,13 +56,14 @@ Usually 24 hours or so on fast production machine. ## Matched -Unknown speed! +These each take 2-4 hours: # No file update for the first import... - zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates - + time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates - # ... 
but do on the second zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - # GROBID extracted (release+file) time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata - + diff --git a/python/env.example b/python/env.example index c986b9d2..75fc5238 100644 --- a/python/env.example +++ b/python/env.example @@ -1,4 +1,5 @@ -FLASK_SECRET_KEY="" +FLASK_SECRET_KEY="TODO-REPLACE-ME" +FATCAT_DOMAIN="dev.fatcat.wiki" # This key used in tests FATCAT_API_AUTH_TOKEN="AgEPZGV2LmZhdGNhdC53aWtpAhYyMDE5MDEwMS1kZXYtZHVtbXkta2V5AAImZWRpdG9yX2lkID0gYWFhYWFhYWFhYWFhYmt2a2FhYWFhYWFhYWkAAht0aW1lID4gMjAxOS0wMS0wOVQwMDo1Nzo1MloAAAYgnroNha1hSftChtxHGTnLEmM/pY8MeQS/jBSV0UNvXug=" FATCAT_API_HOST="http://localhost:9411/v0" @@ -14,6 +15,5 @@ SENTRY_DSN="" # FATCAT_API_AUTH_TOKEN FATCAT_AUTH_WORKER_CROSSREF="" FATCAT_AUTH_WORKER_ORCID="" -FATCAT_AUTH_WORKER_ISSN="" -FATCAT_AUTH_WORKER_MATCHED="" -FATCAT_AUTH_WORKER_GROBID_METADATA="" +FATCAT_AUTH_WORKER_PUBMED="" +FATCAT_AUTH_WORKER_DATACITE="" diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 0e176b2c..a47aa175 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -1,47 +1,40 @@ #!/usr/bin/env python3 -""" -""" - import os, sys, argparse from fatcat_tools import authenticated_api -from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \ - IssnImporter, MatchedImporter, GrobidMetadataImporter, make_kafka_consumer +from fatcat_tools.importers import * def run_crossref(args): - fci = CrossrefImporter(args.api, args.issn_map_file, + fci = CrossrefImporter(args.api, + args.issn_map_file, extid_map_file=args.extid_map_file, - create_containers=(not args.no_create_containers), - check_existing=(not args.no_release_updates)) + edit_batch_size=args.batch_size, + bezerk_mode=args.bezerk_mode) if args.kafka_mode: - consumer = make_kafka_consumer( - args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import") - fci.process_batch(consumer, size=args.batch_size, decode_kafka=True) + KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import").run() else: - fci.process_batch(args.json_file, size=args.batch_size) - fci.describe_run() + JsonLinePusher(fci).run() def run_orcid(args): - foi = OrcidImporter(args.api) - foi.process_batch(args.json_file, size=args.batch_size) - foi.describe_run() + foi = OrcidImporter(args.api, + edit_batch_size=args.batch_size) + JsonLinePusher(foi, args.json_file).run() -def run_issn(args): - fii = IssnImporter(args.api) - fii.process_csv_batch(args.csv_file, size=args.batch_size) - fii.describe_run() +def run_journal_metadata(args): + fii = JournalMetadataImporter(args.api, + edit_batch_size=args.batch_size) + CsvLinePusher(fii, args.csv_file).run() def run_matched(args): fmi = MatchedImporter(args.api, - skip_file_updates=args.no_file_updates) - fmi.process_batch(args.json_file, size=args.batch_size) - fmi.describe_run() + bezerk_mode=args.bezerk_mode, + edit_batch_size=args.batch_size) + JsonLinePusher(fmi, args.json_file).run() def run_grobid_metadata(args): - fmi = GrobidMetadataImporter(args.api) - fmi.process_source(args.tsv_file, group_size=args.group_size) - fmi.describe_run() + fmi = GrobidMetadataImporter(args.api, edit_batch_size=args.batch_size, longtail_oa=args.longtail_oa) + LinePusher(fmi, args.tsv_file).run() def main(): parser 
= argparse.ArgumentParser() @@ -73,18 +66,15 @@ def main(): sub_crossref.add_argument('--extid-map-file', help="DOI-to-other-identifiers sqlite3 database", default=None, type=str) - sub_crossref.add_argument('--no-create-containers', - action='store_true', - help="skip creation of new container entities based on ISSN") sub_crossref.add_argument('--batch-size', help="size of batch to send", default=50, type=int) sub_crossref.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)") - sub_crossref.add_argument('--no-release-updates', + sub_crossref.add_argument('--bezerk-mode', action='store_true', - help="don't lookup existing DOIs, just insert (only for bootstrap)") + help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") sub_orcid = subparsers.add_parser('orcid') sub_orcid.set_defaults( @@ -98,37 +88,37 @@ def main(): help="size of batch to send", default=50, type=int) - sub_issn = subparsers.add_parser('issn') - sub_issn.set_defaults( - func=run_issn, - auth_var="FATCAT_AUTH_WORKER_ISSN", + sub_journal_metadata = subparsers.add_parser('journal-metadata') + sub_journal_metadata.set_defaults( + func=run_journal_metadata, + auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", ) - sub_issn.add_argument('csv_file', + sub_journal_metadata.add_argument('csv_file', help="Journal ISSN CSV metadata file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r')) - sub_issn.add_argument('--batch-size', + sub_journal_metadata.add_argument('--batch-size', help="size of batch to send", default=50, type=int) sub_matched = subparsers.add_parser('matched') sub_matched.set_defaults( func=run_matched, - auth_var="FATCAT_AUTH_WORKER_MATCHED", + auth_var="FATCAT_API_AUTH_TOKEN", ) sub_matched.add_argument('json_file', help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r')) - sub_matched.add_argument('--no-file-updates', - action='store_true', - help="don't lookup existing files, just insert (only for bootstrap)") sub_matched.add_argument('--batch-size', help="size of batch to send", default=50, type=int) + sub_matched.add_argument('--bezerk-mode', + action='store_true', + help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") sub_grobid_metadata = subparsers.add_parser('grobid-metadata') sub_grobid_metadata.set_defaults( func=run_grobid_metadata, - auth_var="FATCAT_AUTH_WORKER_GROBID_METADATA", + auth_var="FATCAT_API_AUTH_TOKEN", ) sub_grobid_metadata.add_argument('tsv_file', help="TSV file to import from (or stdin)", @@ -136,6 +126,9 @@ def main(): sub_grobid_metadata.add_argument('--group-size', help="editgroup group size to use", default=75, type=int) + sub_matched.add_argument('--longtail-oa', + action='store_true', + help="if this is an import of longtail OA content (sets an 'extra' flag)") args = parser.parse_args() if not args.__dict__.get("func"): @@ -144,6 +137,7 @@ def main(): args.api = authenticated_api( args.host_url, + # token is an optional kwarg (can be empty string, None, etc) token=os.environ.get(args.auth_var)) args.func(args) diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index e6f081e5..70f38f5b 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -1,7 +1,22 @@ -from .common import FatcatImporter, make_kafka_consumer +""" +To run an import you combine two classes; one each of: + +- RecordSource: somehow iterates over a source of raw records 
(eg, from a + database, Kafka, files on disk, stdin) and pushes into an entity importer. +- EntityImporter: class that a record iterator pushes raw (unparsed) records + into. The entity importer parses and decides what to do (ignore, update, + insert, etc). There is usually a primary entity type, though related entities + can be created along the way. Maintains API connection and editgroup/batch + state. + +""" + +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP from .grobid_metadata import GrobidMetadataImporter -from .issn import IssnImporter +from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter from .orcid import OrcidImporter +#from .kafka_source import KafkaSource +#from .file_source import FileSource diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 06897bee..89203a4f 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -3,6 +3,7 @@ import re import sys import csv import json +import ftfy import itertools import subprocess from collections import Counter @@ -12,30 +13,66 @@ import fatcat_client from fatcat_client.rest import ApiException -# from: https://docs.python.org/3/library/itertools.html -def grouper(iterable, n, fillvalue=None): - "Collect data into fixed-length chunks or blocks" - args = [iter(iterable)] * n - return itertools.zip_longest(*args, fillvalue=fillvalue) +def clean(thing, force_xml=False): + """ + This function is appropriate to be called on any random, non-markup string, + such as author names, titles, etc. -def make_kafka_consumer(hosts, env, topic_suffix, group): - topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8') - client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0") - consume_topic = client.topics[topic_name] - print("Consuming from kafka topic {}, group {}".format(topic_name, group)) + It will try to clean up commong unicode mangles, HTML characters, etc. - consumer = consume_topic.get_balanced_consumer( - consumer_group=group.encode('utf-8'), - managed=True, - auto_commit_enable=True, - auto_commit_interval_ms=30000, # 30 seconds - compacted_topic=True, - ) - return consumer + This will detect XML/HTML and "do the right thing" (aka, not remove + entities like '&' if there are tags in the string), unless you pass the + 'force_xml' parameter, which might be appropriate for, eg, names and + titles, which generally should be projected down to plain text. + + Also strips extra whitespace. + """ + if not thing: + return thing + fix_entities = 'auto' + if force_xml: + fix_entities = True + fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() + if not fixed: + # wasn't zero-length before, but is now; return None + return None + return fixed + +def test_clean(): -class FatcatImporter: + assert clean(None) == None + assert clean('') == '' + assert clean('123') == '123' + assert clean('a&b') == 'a&b' + assert clean('<b>a&b</b>') == '<b>a&b</b>' + assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' + +class EntityImporter: """ - Base class for fatcat importers + Base class for fatcat entity importers. 
+ + The API exposed to record iterator is: + + push_record(raw_record) + finish() + + The API that implementations are expected to fill in are: + + want(raw_record) -> boolean + parse(raw_record) -> entity + try_update(entity) -> boolean + insert_batch([entity]) -> None + + This class exposes helpers for implementations: + + self.api + self.create_<entity>(entity) -> EntityEdit + for related entity types + self.push_entity(entity) + self.counts['exists'] += 1 + if didn't update or insert because of existing) + self.counts['update'] += 1 + if updated an entity """ def __init__(self, api, **kwargs): @@ -43,87 +80,135 @@ class FatcatImporter: eg_extra = kwargs.get('editgroup_extra', dict()) eg_extra['git_rev'] = eg_extra.get('git_rev', subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter') + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') self.api = api - self._editgroup_description = kwargs.get('editgroup_description') - self._editgroup_extra = kwargs.get('editgroup_extra') - issn_map_file = kwargs.get('issn_map_file') + self.bezerk_mode = kwargs.get('bezerk_mode', False) + self.edit_batch_size = kwargs.get('edit_batch_size', 100) + self.editgroup_description = kwargs.get('editgroup_description') + self.editgroup_extra = kwargs.get('editgroup_extra') + self.reset() self._issnl_id_map = dict() self._orcid_id_map = dict() - self._doi_id_map = dict() - if issn_map_file: - self.read_issn_map_file(issn_map_file) self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") - self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0}) + self._doi_id_map = dict() - def _editgroup(self): - eg = fatcat_client.Editgroup( - description=self._editgroup_description, - extra=self._editgroup_extra, - ) - return self.api.create_editgroup(eg) + def reset(self): + self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + self._edit_count = 0 + self._editgroup_id = None + self._entity_queue = [] - def describe_run(self): - print("Processed {} lines, inserted {}, updated {}.".format( - self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) + def push_record(self, raw_record): + """ + Returns nothing. 
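+
+        Records are dropped (counted as 'skip') if want() rejects them or
+        parse_record() returns None. In bezerk_mode entities are queued for
+        batch insert without any try_update() lookup; otherwise try_update()
+        decides whether the entity still gets inserted.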
+        """
+        if (not raw_record) or (not self.want(raw_record)):
+            self.counts['skip'] += 1
+            return
+        entity = self.parse_record(raw_record)
+        if not entity:
+            self.counts['skip'] += 1
+            return
+        if self.bezerk_mode:
+            self.push_entity(entity)
+            return
+        if self.try_update(entity):
+            self.push_entity(entity)
+            return
 
-    def create_row(self, row, editgroup_id=None):
-        # sub-classes expected to implement this
-        raise NotImplementedError
+    def finish(self):
+        if self._edit_count > 0:
+            self.api.accept_editgroup(self._editgroup_id)
+            self._editgroup_id = None
+            self._edit_count = 0
+
+        if self._entity_queue:
+            self.insert_batch(self._entity_queue)
+            self.counts['insert'] += len(self._entity_queue)
+            self._entity_queue = []
+
+        self.counts['total'] = 0
+        for key in ('skip', 'insert', 'update', 'exists'):
+            self.counts['total'] += self.counts[key]
+        return self.counts
+
+    def _get_editgroup(self, edits=1):
+        # accept the current editgroup and roll over to a fresh one once it
+        # has accumulated edit_batch_size edits
+        if self._edit_count >= self.edit_batch_size:
+            self.api.accept_editgroup(self._editgroup_id)
+            self._editgroup_id = None
+            self._edit_count = 0
 
-    def create_batch(self, rows, editgroup_id=None):
-        # sub-classes expected to implement this
+        if not self._editgroup_id:
+            eg = self.api.create_editgroup(
+                fatcat_client.Editgroup(
+                    description=self.editgroup_description,
+                    extra=self.editgroup_extra))
+            self._editgroup_id = eg.editgroup_id
+
+        self._edit_count += edits
+        return self._editgroup_id
+
+    def create_container(self, entity):
+        eg_id = self._get_editgroup()
+        self.counts['inserted.container'] += 1
+        return self.api.create_container(entity, editgroup_id=eg_id)
+
+    def create_release(self, entity):
+        eg_id = self._get_editgroup()
+        self.counts['inserted.release'] += 1
+        return self.api.create_release(entity, editgroup_id=eg_id)
+
+    def create_file(self, entity):
+        eg_id = self._get_editgroup()
+        self.counts['inserted.file'] += 1
+        return self.api.create_file(entity, editgroup_id=eg_id)
+
+    def updated(self):
+        """
+        Implementations should call this from try_update() if the update was successful
+        """
+        self.counts['update'] += 1
+
+    def push_entity(self, entity):
+        # queue the entity; flush as a batch insert once the queue is full
+        self._entity_queue.append(entity)
+        if len(self._entity_queue) >= self.edit_batch_size:
+            self.insert_batch(self._entity_queue)
+            self.counts['insert'] += len(self._entity_queue)
+            self._entity_queue = []
+
+    def want(self, raw_record):
+        """
+        Implementations can override for optional fast-path to drop a record.
+        Must have no side-effects; returns bool.
+        """
+        return True
+
+    def parse_record(self, raw_record):
+        """
+        Returns an entity (class instance), or None if this record should be
+        skipped.
+
+        May have side-effects (eg, create related entities), but shouldn't
+        update/mutate the actual entity.
+ """ raise NotImplementedError - def process_source(self, source, group_size=100): - """Creates and auto-accepts editgroup every group_size rows""" - eg = self._editgroup() - i = 0 - for i, row in enumerate(source): - self.create_row(row, editgroup_id=eg.editgroup_id) - if i > 0 and (i % group_size) == 0: - self.api.accept_editgroup(eg.editgroup_id) - eg = self._editgroup() - self.counts['processed_lines'] += 1 - if i == 0 or (i % group_size) != 0: - self.api.accept_editgroup(eg.editgroup_id) - - def process_batch(self, source, size=50, decode_kafka=False): - """Reads and processes in batches (not API-call-per-)""" - for rows in grouper(source, size): - if decode_kafka: - rows = [msg.value.decode('utf-8') for msg in rows] - self.counts['processed_lines'] += len(rows) - #eg = self._editgroup() - #self.create_batch(rows, editgroup_id=eg.editgroup_id) - self.create_batch(rows) - - def process_csv_source(self, source, group_size=100, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_source(reader, group_size) - - def process_csv_batch(self, source, size=50, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_batch(reader, size) + def try_update(self, raw_record): + """ + Passed the output of parse(). Should try to find an existing entity and + update it (PUT), decide we should do nothing (based on the existing + record), or create a new one. - def is_issnl(self, issnl): - return len(issnl) == 9 and issnl[4] == '-' + Implementations must update the exists/updated/skip counts + appropriately in this method. - def lookup_issnl(self, issnl): - """Caches calls to the ISSN-L lookup API endpoint in a local dict""" - if issnl in self._issnl_id_map: - return self._issnl_id_map[issnl] - container_id = None - try: - rv = self.api.lookup_container(issnl=issnl) - container_id = rv.ident - except ApiException as ae: - # If anything other than a 404 (not found), something is wrong - assert ae.status == 404 - self._issnl_id_map[issnl] = container_id # might be None - return container_id + Returns boolean: True if the entity should still be inserted, False otherwise + """ + raise NotImplementedError + + def insert_batch(self, raw_record): + raise NotImplementedError def is_orcid(self, orcid): return self._orcid_regex.match(orcid) is not None @@ -163,6 +248,23 @@ class FatcatImporter: self._doi_id_map[doi] = release_id # might be None return release_id + def is_issnl(self, issnl): + return len(issnl) == 9 and issnl[4] == '-' + + def lookup_issnl(self, issnl): + """Caches calls to the ISSN-L lookup API endpoint in a local dict""" + if issnl in self._issnl_id_map: + return self._issnl_id_map[issnl] + container_id = None + try: + rv = self.api.lookup_container(issnl=issnl) + container_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert ae.status == 404 + self._issnl_id_map[issnl] = container_id # might be None + return container_id + def read_issn_map_file(self, issn_map_file): print("Loading ISSN map file...") self._issn_issnl_map = dict() @@ -179,3 +281,117 @@ class FatcatImporter: if issn is None: return None return self._issn_issnl_map.get(issn) + + +class RecordPusher: + """ + Base class for different importer sources. Pretty trivial interface, just + wraps an importer and pushes records in to it. 
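+
+    For example, a JSON-lines ORCID import (given an authenticated 'api'
+    client; this mirrors run_orcid() in fatcat_import.py):
+
+        importer = OrcidImporter(api, edit_batch_size=50)
+        counts = JsonLinePusher(importer, open('public_profiles_1_2_json.all.json')).run()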
+ """ + + def __init__(self, importer, **kwargs): + self.importer = importer + + def run(self): + """ + This will look something like: + + for line in sys.stdin: + record = json.loads(line) + self.importer.push_record(record) + print(self.importer.finish()) + """ + raise NotImplementedError + + +class JsonLinePusher(RecordPusher): + + def __init__(self, importer, json_file, **kwargs): + self.importer = importer + self.json_file = json_file + + def run(self): + for line in self.json_file: + if not line: + continue + record = json.loads(line) + self.importer.push_record(record) + counts = self.importer.finish() + print(counts) + return counts + + +class CsvPusher(RecordPusher): + + def __init__(self, importer, csv_file, **kwargs): + self.importer = importer + self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) + + def run(self): + for line in self.reader: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class LinePusher(RecordPusher): + + def __init__(self, importer, text_file, **kwargs): + self.importer = importer + self.text_file = text_file + + def run(self): + for line in self.text_file: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class KafkaJsonPusher(RecordPusher): + + def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): + self.importer = importer + self.consumer = make_kafka_consumer( + kafka_hosts, + kafka_env, + topic_suffix, + group, + ) + + def run(self): + count = 0 + for msg in self.consumer: + if not msg: + continue + record = json.loads(msg.value.decode('utf-8')) + self.importer.push_record(record) + count += 1 + if count % 500 == 0: + print("Import counts: {}".format(self.importer.counts)) + # TODO: should catch UNIX signals (HUP?) 
to shutdown cleanly, and/or + # commit the current batch if it has been lingering + counts = self.importer.finish() + print(counts) + return counts + + +def make_kafka_consumer(hosts, env, topic_suffix, group): + topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8') + client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0") + consume_topic = client.topics[topic_name] + print("Consuming from kafka topic {}, group {}".format(topic_name, group)) + + consumer = consume_topic.get_balanced_consumer( + consumer_group=group.encode('utf-8'), + managed=True, + auto_commit_enable=True, + auto_commit_interval_ms=30000, # 30 seconds + compacted_topic=True, + ) + return consumer diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 6365e491..00c719f1 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -6,7 +6,7 @@ import datetime import itertools import subprocess import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean # The docs/guide should be the cannonical home for these mappings; update there @@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = { 'standard': 'standard', } -class CrossrefImporter(FatcatImporter): +CONTAINER_TYPE_MAP = { + 'article-journal': 'journal', + 'paper-conference': 'conference', + 'book': 'book-series', +} + +# TODO: +LICENSE_SLUG_MAP = { + "http://creativecommons.org/licenses/by/3.0/": "CC-BY", + "http://creativecommons.org/licenses/by/4.0/": "CC-BY", + "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", + "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", + "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", + "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", + "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", + "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", + "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", + "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", + "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", + "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", + "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", + # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license + # http://www.springer.com/tdm doesn't seem like a license +} + +class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. 
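+
+    A single-process invocation mirrors run_crossref() in fatcat_import.py,
+    reading JSON lines from stdin (dataset paths are the ones used elsewhere in
+    these notes; authenticated_api comes from fatcat_tools, and token handling
+    is omitted here):
+
+        api = authenticated_api('http://localhost:9411/v0', token=None)
+        fci = CrossrefImporter(api,
+            '/srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt',
+            extid_map_file='/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3',
+            bezerk_mode=False)
+        JsonLinePusher(fci, sys.stdin).run()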
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter): issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra) + + self.create_containers = kwargs.get('create_containers') extid_map_file = kwargs.get('extid_map_file') - create_containers = kwargs.get('create_containers') - check_existing = kwargs.get('check_existing') self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter): self.extid_map_db = sqlite3.connect(db_uri, uri=True) else: print("Not using external ID map") - self.create_containers = create_containers - self.check_existing = check_existing + + self.read_issn_map_file(issn_map_file) def lookup_ext_ids(self, doi): if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = [str(cell or '') or None for cell in row] return dict( core_id=row[0], pmid=row[1], pmcid=row[2], - wikidata_qid=row[3]) + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) def map_release_type(self, crossref_type): return CROSSREF_TYPE_MAP.get(crossref_type) - def parse_crossref_dict(self, obj): + def map_container_type(self, crossref_type): + return CONTAINER_TYPE_MAP.get(crossref_type) + + def want(self, obj): + if not obj.get('title'): + return False + + # do most of these checks in-line below + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
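+        (may also create a container entity via self.create_container, as a
+        side-effect, when the ISSN-L isn't already known)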
returns a ReleaseEntity """ - # Do require the 'title' keys to exsit, as release entities do - if (not 'title' in obj) or (not obj['title']): - return None - # Ways to be out of scope (provisionally) # journal-issue and journal-volume map to None, but allowed for now if obj.get('type') in (None, 'journal', 'proceedings', @@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter): 'book-track', 'proceedings-series'): return None - # lookup existing DOI - existing_release = None - if self.check_existing: - try: - existing_release = self.api.lookup_release(doi=obj['DOI'].lower()) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - - # eventually we'll want to support "updates", but for now just skip if - # entity already exists - if existing_release: + # Do require the 'title' keys to exsit, as release entities do + if (not 'title' in obj) or (not obj['title']): return None + release_type = self.map_release_type(obj['type']) + # contribs def do_contribs(obj_list, ctype): contribs = [] @@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter): index = i else: index = None + raw_affiliation = None if am.get('affiliation'): - # note: affiliation => affiliations - extra['affiliations'] = am.get('affiliation') + if len(am.get('affiliation')) > 0: + raw_affiliation = am.get('affiliation')[0]['name'] + if len(am.get('affiliation')) > 1: + # note: affiliation => more_affiliations + extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] if am.get('sequence') and am.get('sequence') != "additional": - extra['sequence'] = am.get('sequence') + extra['seq'] = clean(am.get('sequence')) if not extra: extra = None assert ctype in ("author", "editor", "translator") contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, - raw_name=raw_name, + raw_name=clean(raw_name), + raw_affiliation=clean(raw_affiliation), role=ctype, extra=extra)) return contribs @@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter): container_id = self.lookup_issnl(issnl) publisher = obj.get('publisher') - ce = None if (container_id is None and self.create_containers and (issnl is not None) and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, - publisher=publisher, - name=obj['container-title'][0]) + publisher=clean(publisher), + container_type=self.map_container_type(release_type), + name=clean(obj['container-title'][0], force_xml=True)) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + + # license slug + license_slug = None + license_extra = [] + for l in obj.get('license', []): + if l['content-version'] not in ('vor', 'unspecified'): + continue + slug = LICENSE_SLUG_MAP.get(l['URL']) + if slug: + license_slug = slug + if 'start' in l: + l['start'] = l['start']['date-time'] + license_extra.append(l) # references refs = [] for i, rm in enumerate(obj.get('reference', [])): try: year = int(rm.get('year')) - # NOTE: will need to update/config in the future! + # TODO: will need to update/config in the future! # NOTE: are there crossref works with year < 100? 
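+                # out-of-range years are dropped (set to None) rather than guessed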
if year > 2025 or year < 100: year = None except: year = None - extra = rm.copy() - if rm.get('DOI'): - extra['doi'] = rm.get('DOI').lower() key = rm.get('key') if key and key.startswith(obj['DOI'].upper()): key = key.replace(obj['DOI'].upper() + "-", '') @@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter): container_name = rm.get('volume-title') if not container_name: container_name = rm.get('journal-title') - extra.pop('DOI', None) - extra.pop('key', None) - extra.pop('year', None) - extra.pop('volume-name', None) - extra.pop('journal-title', None) - extra.pop('title', None) - extra.pop('first-page', None) - extra.pop('doi-asserted-by', None) + elif rm.get('journal-title'): + extra['journal-title'] = rm['journal-title'] + extra = dict() + if rm.get('DOI'): + extra['doi'] = rm.get('DOI').lower() + # TODO: what fields here? CSL citation stuff + for k in ('author', 'editor', 'edition', 'authority', 'version', + 'genre', 'url', 'event', 'issue', 'volume', 'date', + 'accessed_date', 'issued', 'page', 'medium', + 'collection_title', 'chapter_number'): + if clean(rm.get(k)): + extra[k] = clean(rm[k]) if extra: extra = dict(crossref=extra) else: @@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter): target_release_id=None, key=key, year=year, - container_name=container_name, - title=rm.get('title'), - locator=rm.get('first-page'), + container_name=clean(container_name), + title=clean(rm.get('title')), + locator=clean(rm.get('first-page')), # TODO: just dump JSON somewhere here? extra=extra)) @@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter): if obj.get('abstract') != None: abstracts.append(fatcat_client.ReleaseEntityAbstracts( mimetype="application/xml+jats", - content=obj.get('abstract'))) + content=clean(obj.get('abstract')))) # extra fields extra = dict() - for key in ('subject', 'type', 'license', 'alternative-id', - 'container-title', 'original-title', 'subtitle', 'archive', - 'funder', 'group-title'): - # TODO: unpack "container-title" array + for key in ('subject', 'type', 'alternative-id', 'container-title', + 'subtitle', 'archive', 'funder', 'group-title'): + # TODO: unpack "container-title" array? 
val = obj.get(key) if val: - extra[key] = val - if 'license' in extra and extra['license']: - for i in range(len(extra['license'])): - if 'start' in extra['license'][i]: - extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] + if type(val) == str: + extra[key] = clean(val) + else: + extra[key] = val + if license_extra: + extra['license'] = license_extra + if len(obj['title']) > 1: - extra['other-titles'] = obj['title'][1:] - # TODO: this should be top-level - extra['is_kept'] = len(obj.get('archive', [])) > 0 + extra['other-titles'] = [clean(t) for t in obj['title'][1:]] # ISBN isbn13 = None @@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter): re = fatcat_client.ReleaseEntity( work_id=None, - title=obj.get('title', [None])[0], - contribs=contribs, - refs=refs, container_id=container_id, - publisher=publisher, - release_type=self.map_release_type(obj['type']), + title=clean(obj.get('title', [None])[0], force_xml=True), + original_title=clean(obj.get('original-title', [None])[0]), + release_type=release_type, release_status=release_status, + release_date=release_date, + release_year=release_year, + publisher=clean(publisher), doi=obj['DOI'].lower(), - isbn13=isbn13, - core_id=extids['core_id'], pmid=extids['pmid'], pmcid=extids['pmcid'], wikidata_qid=extids['wikidata_qid'], - release_date=release_date, - release_year=release_year, - issue=obj.get('issue'), - volume=obj.get('volume'), - pages=obj.get('page'), + isbn13=isbn13, + core_id=extids['core_id'], + arxiv_id=extids['arxiv_id'], + jstor_id=extids['jstor_id'], + volume=clean(obj.get('volume')), + issue=clean(obj.get('issue')), + pages=clean(obj.get('page')), + language=None, # crossref doesn't supply language info + license_slug=license_slug, + extra=dict(crossref=extra), abstracts=abstracts, - extra=dict(crossref=extra)) - return (re, ce) + contribs=contribs, + refs=refs, + ) + return re + + def try_update(self, re): + + # lookup existing DOI (don't need to try other ext idents for crossref) + existing = None + try: + existing = self.api.lookup_release(doi=re.doi) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_release_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) - def create_row(self, row, editgroup_id=None): - if row is None: - return - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - container = self.api.create_container(ce, editgroup_id=editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - self.api.create_release(re, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Current work/release pairing disallows batch creation of releases. 
- Could do batch work creation and then match against releases, but meh.""" - release_batch = [] - for row in batch: - if row is None: - continue - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - ce_eg = self.api.create_editgroup(fatcat_client.Editgroup()) - container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id) - self.api.accept_editgroup(ce_eg.editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - release_batch.append(re) - self.api.create_release_batch(release_batch, autoaccept="true") - self.counts['insert'] += len(release_batch) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 5e61a154..9d95fe0b 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,12 +5,22 @@ import json import base64 import datetime import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean MAX_ABSTRACT_BYTES=4096 -class GrobidMetadataImporter(FatcatImporter): +class GrobidMetadataImporter(EntityImporter): + """ + This is a complex case: we need to parse and create both file and release entities. + + The "primary" entity here is really File, not Release. If a matching File + exists, we bail in want(); if not we insert the Release during parsing, and + insert both. + + TODO: should instead check if the File has any releases; if not, insert and update. + TODO: relaxing 'None' constraint on parse_record() might make this refactor-able. + """ def __init__(self, api, **kwargs): @@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") + self.longtail_oa = kwargs.get("longtail_oa", False) + + def want(self, raw_record): + return True + + def parse_record(self, row): + + fields = row.split('\t') + sha1_key = fields[0] + cdx = json.loads(fields[1]) + mimetype = fields[2] + file_size = int(fields[3]) + grobid_meta = json.loads(fields[4]) + fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) + re = self.parse_grobid_json(grobid_meta) + + if not (fe and re): + return None + + # lookup existing file SHA1 + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # if file is already in here, presumably not actually long-tail + # HACK: this is doing an exists check in parse_record(), which is weird + # TODO: this is where we should check if the file actually has + # release_ids and/or URLs associated with it + if existing and not self.bezerk_mode: + self.counts['exists'] += 1 + self.counts['skip'] -= 1 + return None + + release_edit = self.create_release(re) + fe.release_ids.append(release_edit.ident) + return fe def parse_grobid_json(self, obj): @@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter): abobj = dict( mimetype="text/plain", language=None, - content=obj.get('abstract').strip()) + content=clean(obj.get('abstract'))) abstracts = [abobj] else: abstracts = None @@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter): for i, a in enumerate(obj.get('authors', [])): contribs.append(fatcat_client.ReleaseContrib( index=i, - raw_name=a['name'], + raw_name=clean(a['name']), role="author", extra=None)) + # XXX: why is this a dict()? 
not covered by tests? refs = [] for raw in obj.get('citations', []): cite_extra = dict() ref = dict() - ref['key'] = raw.get('id') + ref['key'] = clean(raw.get('id')) if raw.get('title'): - ref['title'] = raw['title'].strip() + ref['title'] = clean(raw['title']) if raw.get('date'): try: year = int(raw['date'].strip()[:4]) @@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter): pass for key in ('volume', 'url', 'issue', 'publisher'): if raw.get(key): - cite_extra[key] = raw[key].strip() + cite_extra[key] = clean(raw[key]) if raw.get('authors'): - cite_extra['authors'] = [a['name'] for a in raw['authors']] + cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] if cite_extra: cite_extra = dict(grobid=cite_extra) else: @@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter): if obj.get('doi'): extra['doi'] = obj['doi'] if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] - - extra['is_longtail_oa'] = True + extra['container_name'] = clean(obj['journal']['name']) # TODO: ISSN/eISSN handling? or just journal name lookup? + if self.longtail_oa: + extra['longtail_oa'] = True + if extra: extra = dict(grobid=extra) else: extra = None re = fatcat_client.ReleaseEntity( - title=obj['title'].strip(), + title=clean(obj['title'], force_xml=True), release_type="article-journal", release_date=release_date, release_year=release_year, contribs=contribs, refs=refs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), + publisher=clean(obj['journal'].get('publisher')), + volume=clean(obj['journal'].get('volume')), + issue=clean(obj['journal'].get('issue')), abstracts=abstracts, extra=extra) return re @@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter): sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() - # lookup existing SHA1, or create new entity - try: - existing_file = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - existing_file = None - - if existing_file: - # if file is already in here, presumably not actually long-tail - return None fe = fatcat_client.FileEntity( sha1=sha1, size=int(file_size), @@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter): # parse URLs and CDX original = cdx['url'] + assert len(cdx['dt']) >= 8 wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) @@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter): return fe - def create_row(self, row, editgroup_id=None): - if not row: - return - fields = row.split('\t') - sha1_key = fields[0] - cdx = json.loads(fields[1]) - mimetype = fields[2] - file_size = int(fields[3]) - grobid_meta = json.loads(fields[4]) - fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) - re = self.parse_grobid_json(grobid_meta) - if fe and re: - release_entity = self.api.create_release(re, editgroup_id=editgroup_id) - # release ident can't already be in release list because we just - # created it - fe.release_ids.append(release_entity.ident) - file_entity = self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - # NB: batch mode not implemented + def try_update(self, entity): + # did the exists check in 'parse_record()', because we needed to create a release + return True + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + 
extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py deleted file mode 100644 index f4d525a4..00000000 --- a/python/fatcat_tools/importers/issn.py +++ /dev/null @@ -1,89 +0,0 @@ - -import sys -import json -import itertools -import fatcat_client -from .common import FatcatImporter - - -def or_none(s): - if s is None: - return None - if len(s) == 0: - return None - return s - -def truthy(s): - if s is None: - return None - s = s.lower() - - if s in ('true', 't', 'yes', 'y', '1'): - return True - elif s in ('false', 'f', 'no', 'n', '0'): - return False - else: - return None - -class IssnImporter(FatcatImporter): - """ - Imports journal metadata ("containers") by ISSN, currently from a custom - (data munged) .csv file format - - CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): - - ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count - """ - - def __init__(self, api, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra) - - def parse_issn_row(self, row): - """ - row is a python dict (parsed from CSV). - returns a ContainerEntity (or None if invalid or couldn't parse) - """ - title = or_none(row['title']) - issnl = or_none(row['ISSN-L']) - if title is None or issnl is None: - return None - extra = dict( - in_doaj=truthy(row['in_doaj']), - in_road=truthy(row['in_road']), - in_norwegian=truthy(row['in_norwegian']), - language=or_none(row['lang']), - url=or_none(row['url']), - ISSNp=or_none(row['ISSN-print']), - ISSNe=or_none(row['ISSN-electronic']), - is_oa=truthy(row['is_oa']), - is_kept=truthy(row['is_kept']), - ) - ce = fatcat_client.ContainerEntity( - issnl=issnl, - name=title, - publisher=or_none(row['publisher']), - abbrev=None, - coden=None, - extra=extra) - return ce - - def create_row(self, row, editgroup_id=None): - ce = self.parse_issn_row(row) - if ce is not None: - self.api.create_container(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_issn_row(l) - for l in batch if (l is not None)] - objects = [o for o in objects if (o is not None)] - self.api.create_container_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py new file mode 100644 index 00000000..cf3971b5 --- /dev/null +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -0,0 +1,183 @@ + +import sys +import json +import itertools +import fatcat_client +from .common import EntityImporter, clean + + +def or_none(s): + if s is None: + return None + if len(s) == 0: + return None + return s + +def truthy(s): + if s is None: + return None + s = s.lower() + + if s in ('true', 't', 'yes', 'y', '1'): + return True + elif s in ('false', 'f', 'no', 'n', '0'): + return False + else: + return None + +class 
JournalMetadataImporter(EntityImporter): + """ + Imports journal metadata ("containers") by ISSN, currently from a custom + (data munged) .csv file format + + CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): + + ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + + + 'extra' fields: + + doaj + as_of: datetime of most recent check; if not set, not actually in DOAJ + seal: bool + work_level: bool (are work-level publications deposited with DOAJ?) + archiving: array, can include 'library' or 'other' + road + as_of: datetime of most recent check; if not set, not actually in ROAD + pubmed (TODO: delete?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + norwegian (TODO: drop this?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + id (integer) + level (integer; 0-2) + kbart + lockss + year_rle + volume_rle + portico + ... + clockss + ... + sherpa_romeo + color + jstor + year_rle + volume_rle + scopus + id + TODO: print/electronic distinction? + wos + id + doi + crossref_doi: DOI of the title in crossref (if exists) + prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref) + ia + sim + nap_id + year_rle + volume_rle + longtail: boolean + homepage + as_of: datetime of last attempt + url + status: HTTP/heritrix status of homepage crawl + + issnp: string + issne: string + coden: string + abbrev: string + oclc_id: string (TODO: lookup?) + lccn_id: string (TODO: lookup?) + dblb_id: string + default_license: slug + original_name: native name (if name is translated) + platform: hosting platform: OJS, wordpress, scielo, etc + mimetypes: array of strings (eg, 'application/pdf', 'text/html') + first_year: year (integer) + last_year: if publishing has stopped + primary_language: single ISO code, or 'mixed' + languages: array of ISO codes + region: TODO: continent/world-region + nation: shortcode of nation + discipline: TODO: highest-level subject; "life science", "humanities", etc + field: TODO: narrower description of field + subjects: TODO? + url: homepage + is_oa: boolean. If true, can assume all releases under this container are "Open Access" + TODO: domains, if exclusive? + TODO: fulltext_regex, if a known pattern? + + For KBART, etc: + We "over-count" on the assumption that "in-progress" status works will soon actually be preserved. + year and volume spans are run-length-encoded arrays, using integers: + - if an integer, means that year is preserved + - if an array of length 2, means everything between the two numbers (inclusive) is preserved + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + + def want(self, raw_record): + if raw_record.get('ISSN-L'): + return True + return False + + def parse_record(self, row): + """ + row is a python dict (parsed from CSV). 
+ returns a ContainerEntity (or None if invalid or couldn't parse) + """ + title = or_none(row['title']) + issnl = or_none(row['ISSN-L']) + if title is None or issnl is None: + return None + extra = dict( + in_doaj=truthy(row['in_doaj']), + in_road=truthy(row['in_road']), + in_norwegian=truthy(row['in_norwegian']), + language=or_none(row['lang']), + url=or_none(row['url']), + ISSNp=or_none(row['ISSN-print']), + ISSNe=or_none(row['ISSN-electronic']), + is_oa=truthy(row['is_oa']), + is_kept=truthy(row['is_kept']), + ) + ce = fatcat_client.ContainerEntity( + issnl=issnl, + name=clean(title), + publisher=or_none(clean(row['publisher'])), + extra=extra) + return ce + + def try_update(self, ce): + + existing = None + try: + existing = self.api.lookup_container(issnl=ce.issnl) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_container_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 1e5c22f7..2ec6c95d 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,16 +4,10 @@ import json import sqlite3 import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean -#row = row.split('\t') -#assert len(row) == 2 -#sha1 = row[0].replace('sha1:') -#sha1 = base64.b16encode(base64.b32decode(sha1)).lower() -#print(sha1) -#dois = [d.lower() for d in json.loads(row[1])] -class MatchedImporter(FatcatImporter): +class MatchedImporter(EntityImporter): """ Importer for "file to crossref DOI" matches. 
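The year_rle / volume_rle convention from the JournalMetadataImporter docstring above can be made concrete with a small decoder. This is only a sketch of the documented semantics, not code from the importer itself:

    def expand_rle_span(spans):
        # Expand a run-length-encoded preservation span list:
        # a bare integer means that single year/volume is preserved;
        # a two-element list means the whole inclusive range is preserved.
        covered = set()
        for span in spans or []:
            if isinstance(span, int):
                covered.add(span)
            else:
                start, end = span
                covered.update(range(start, end + 1))
        return sorted(covered)

    # expand_rle_span([1990, [1995, 1998]]) -> [1990, 1995, 1996, 1997, 1998]

For orientation on the importer below, a rough sketch of the JSON line format MatchedImporter consumes, inferred from the fields parse_record() reads further down (all values here are made up for illustration, not real fixture data):

    example_match = {
        "sha1": "00242a192acc258bdfdb151943419437f440c313",  # hypothetical hash
        "dois": ["10.123/abc"],                              # hypothetical DOI
        "size": 255629,
        "mimetype": "application/pdf",
        "url": ["https://example.com/fulltext.pdf"],
        "cdx": [{"url": "http://example.com/fulltext.pdf", "dt": "20170227164644"}],
        # "md5" and "sha256" are also read if present
    }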
@@ -48,7 +42,6 @@ class MatchedImporter(FatcatImporter): editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mime = kwargs.get("default_mime", None) - self.skip_file_updates = kwargs.get("skip_file_updates", False) def make_url(self, raw): rel = self.default_link_rel @@ -59,26 +52,13 @@ class MatchedImporter(FatcatImporter): rel = "repository" elif "//web.archive.org/" in raw or "//archive.is/" in raw: rel = "webarchive" - return fatcat_client.FileEntityUrls(url=raw, rel=rel) + return (rel, raw) - def parse_matched_dict(self, obj): - sha1 = obj['sha1'] - dois = [d.lower() for d in obj.get('dois', [])] + def want(self, raw_record): + return True - # lookup sha1, or create new entity - fe = None - if not self.skip_file_updates: - try: - fe = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - if fe is None: - fe = fatcat_client.FileEntity( - sha1=sha1, - release_ids=[], - urls=[], - ) + def parse_record(self, obj): + dois = [d.lower() for d in obj.get('dois', [])] # lookup dois re_list = set() @@ -93,67 +73,77 @@ class MatchedImporter(FatcatImporter): print("DOI not found: {}".format(doi)) else: re_list.add(re.ident) - if len(re_list) == 0: + release_ids = list(re_list) + if len(release_ids) == 0: return None - if fe.release_ids == set(re_list): - return None - re_list.update(fe.release_ids) - fe.release_ids = list(re_list) # parse URLs and CDX - existing_urls = [feu.url for feu in fe.urls] + urls = set() for url in obj.get('url', []): - if url not in existing_urls: - url = self.make_url(url) - if url != None: - fe.urls.append(url) + url = self.make_url(url) + if url != None: + urls.add(url) for cdx in obj.get('cdx', []): original = cdx['url'] wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) - if wayback not in existing_urls: - fe.urls.append( - fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) - if original not in existing_urls: - url = self.make_url(original) - if url != None: - fe.urls.append(url) - - if obj.get('size') != None: - fe.size = int(obj['size']) - fe.sha256 = obj.get('sha256', fe.sha256) - fe.md5 = obj.get('md5', fe.sha256) - if obj.get('mimetype') is None: - if fe.mimetype is None: - fe.mimetype = self.default_mime - else: - fe.mimetype = obj.get('mimetype') + urls.add(("webarchive", wayback)) + url = self.make_url(original) + if url != None: + urls.add(url) + urls = [fatcat_client.FileEntityUrls(rel, url) for (rel, url) in urls] + if len(urls) == 0: + return None + + size = obj.get('size') + if size: + size = int(size) + + fe = fatcat_client.FileEntity( + md5=obj.get('md5'), + sha1=obj['sha1'], + sha256=obj.get('sha256'), + size=size, + mimetype=obj.get('mimetype'), + release_ids=release_ids, + urls=urls, + ) return fe - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - fe = self.parse_matched_dict(obj) - if fe is not None: - if fe.ident is None: - self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - else: - self.api.update_file(fe.ident, fe, editgroup_id=editgroup_id) - self.counts['update'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_matched_dict(json.loads(l)) - for l in batch if l != None] - new_objects = [o for o in objects if o != None and o.ident == None] - update_objects = [o for o in objects if o != None and o.ident != None] - if len(update_objects): - update_eg = 
self._editgroup().editgroup_id - for obj in update_objects: - self.api.update_file(obj.ident, obj, editgroup_id=update_eg) - self.api.accept_editgroup(update_eg) - if len(new_objects) > 0: - self.api.create_file_batch(new_objects, autoaccept="true") - self.counts['update'] += len(update_objects) - self.counts['insert'] += len(new_objects) + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + fe.release_ids = list(set(fe.release_ids + existing.release_ids)) + if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0: + # no new release matches *and* there are already existing URLs + self.counts['exists'] += 1 + return False + + # merge the existing into this one and update + existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) + existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] + existing.release_ids = list(set(fe.release_ids + existing.release_ids)) + existing.mimetype = existing.mimetype or fe.mimetype + existing.size = existing.size or fe.size + existing.md5 = existing.md5 or fe.md5 + existing.sha256 = existing.sha256 or fe.sha256 + self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup()) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 0c8b1d62..02c9bf00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean def value_or_none(e): if type(e) == dict: @@ -20,7 +20,7 @@ def value_or_none(e): return None return e -class OrcidImporter(FatcatImporter): +class OrcidImporter(EntityImporter): def __init__(self, api, **kwargs): @@ -32,14 +32,16 @@ class OrcidImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) - def parse_orcid_dict(self, obj): + def want(self, raw_record): + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
returns a CreatorEntity """ name = obj['person']['name'] - if name is None: - return None + assert name extra = None given = value_or_none(name.get('given-names')) sur = value_or_none(name.get('family-name')) @@ -61,23 +63,30 @@ class OrcidImporter(FatcatImporter): return None ce = fatcat_client.CreatorEntity( orcid=orcid, - given_name=given, - surname=sur, - display_name=display, + given_name=clean(given), + surname=clean(sur), + display_name=clean(display), extra=extra) return ce - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - ce = self.parse_orcid_dict(obj) - if ce is not None: - self.api.create_creator(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 + def try_update(self, raw_record): + existing = None + try: + existing = self.api.lookup_creator(orcid=raw_record.orcid) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_orcid_dict(json.loads(l)) - for l in batch if l != None] - objects = [o for o in objects if o != None] - self.api.create_creator_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) + def insert_batch(self, batch): + self.api.create_creator_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index 0f957f9a..2493b1ab 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -1,4 +1,5 @@ + import collections from fatcat_client import ReleaseEntity, ApiClient @@ -26,25 +27,43 @@ def release_to_elasticsearch(release): Raises exception on error (never returns None) """ - if release.state != 'active': - raise ValueError("Entity is not 'active'") + if release.state in ('redirect', 'deleted'): + return dict( + ident = release.ident, + state = release.state, + ) + elif release.state != 'active': + raise ValueError("Unhandled release state: {}".format(release.state)) # First, the easy ones (direct copy) t = dict( ident = release.ident, + state = release.state, revision = release.revision, title = release.title, + original_title = release.original_title, release_type = release.release_type, release_status = release.release_status, language = release.language, + license = release.license_slug, doi = release.doi, pmid = release.pmid, pmcid = release.pmcid, isbn13 = release.isbn13, + wikidata_qid = release.wikidata_qid, core_id = release.core_id, - wikidata_qid = release.wikidata_qid + arxiv_id = release.core_id, + jstor_id = release.jstor_id, ) + is_oa = None + is_longtail_oa = None + in_kbart = None + in_web = False + in_dweb = False + in_ia = False + in_shadow = False + if release.release_date: # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD) t['release_date'] = release.release_date.isoformat() @@ -53,52 +72,99 @@ def release_to_elasticsearch(release): if release.release_year is not None: t['release_year'] = release.release_year + t['any_abstract'] = len(release.abstracts) > 0 + t['ref_count'] = len(release.refs or []) + t['contrib_count'] = len(release.contribs or []) + contrib_names = [] + for c in (release.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + 
container = release.container - container_is_kept = False if container: t['publisher'] = container.publisher t['container_name'] = container.name t['container_issnl'] = container.issnl - container_extra = container.extra - if container_extra: - t['container_is_oa'] = container_extra.get('is_oa') - container_is_kept = container_extra.get('is_kept', False) - t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + t['container_type'] = container.container_type + if container.extra: + if container.extra.get('is_oa') or container.extra.get('in_doaj'): + is_oa = True + if container.extra.get('in_kbart'): + # TODO: better KBART check goes here + in_kbart = True + if container.extra.get('ia'): + # TODO: container longtail check goes here + # TODO: sim/microfilm check goes here + pass + # TODO: SHERPA/Romeo goes here else: t['publisher'] = release.publisher files = release.files or [] t['file_count'] = len(files) - in_wa = False - in_ia = False - t['file_pdf_url'] = None + t['fileset_count'] = len(release.filesets or []) + t['webcapture_count'] = len(release.webcaptures or []) + any_pdf_url = None + good_pdf_url = None + best_pdf_url = None + ia_pdf_url = None for f in files: + if f.extra and f.extra.get('shadows'): + # TODO: shadow check goes here + in_shadows = True is_pdf = 'pdf' in (f.mimetype or '') for url in (f.urls or []): - if url.rel == 'webarchive': - in_wa = True - if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''): + if url.url.lower().startswith('http'): + in_web = True + if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): + # TODO: not sure what rel will be + in_dweb = True + if is_pdf: + any_pdf_url = url.url + if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf: + is_preserved = True + good_pdf_url = url.url + if '//web.archive.org/' in url.url or '//archive.org/' in url.url: in_ia = True if is_pdf: - t['file_pdf_url'] = url.url - if not t['file_pdf_url'] and is_pdf: - t['file_pdf_url'] = url.url - t['file_in_webarchive'] = in_wa - t['file_in_ia'] = in_ia + best_pdf_url = url.url + ia_pdf_url = url.url + # here is where we bake-in priority; IA-specific + t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url + t['ia_pdf_url'] = ia_pdf_url + + if release.license_slug: + # TODO: more/better checks here, particularly strict *not* OA licenses + if release.license_slug.startswith("CC-"): + is_oa = True extra = release.extra or dict() if extra: - t['in_shadow'] = extra.get('in_shadow') - if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): - t['container_is_longtail_oa'] = True - t['any_abstract'] = bool(release.abstracts) - t['is_kept'] = container_is_kept or extra.get('is_kept', False) + # TODO: longtail OA check from GROBID here + if extra.get('in_kbart'): + # NOTE: not actually setting this anywhere + in_kbart = True + if extra.get('is_oa'): + # NOTE: not actually setting this anywhere + is_oa = True + if extra.get('grobid'): + if not t.get('container_name'): + t['container_name'] = extra['grobid'].get('container_name') + if extra['grobid'].get('longtail_oa'): + is_longtail_oa = True + if extra.get('crossref'): + if extra['crossref'].get('archive'): + # all crossref archives are KBART, I believe + in_kbart = True - t['ref_count'] = len(release.refs or []) - t['contrib_count'] = len(release.contribs or []) - contrib_names = [] - for c in (release.contribs or []): - if c.raw_name: - contrib_names.append(c.raw_name) - t['contrib_names'] = contrib_names + if is_longtail_oa: + is_oa = True + t['is_oa'] 
= is_oa + t['is_longtail_oa'] = is_longtail_oa + t['in_kbart'] = in_kbart + t['in_web'] = in_web + t['in_dweb'] = in_dweb + t['in_ia'] = in_ia + t['is_preserved'] = in_ia or in_kbart return t diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 8690a791..636ed304 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -93,7 +93,7 @@ class EntityUpdatesWorker(FatcatWorker): release_edits = cle['editgroup']['edits']['releases'] for re in release_edits: ident = re['ident'] - release = self.api.get_release(ident, expand="files,container") + release = self.api.get_release(ident, expand="files,filesets,webcaptures,container") release_dict = self.api.api_client.sanitize_for_serialization(release) producer.produce( message=json.dumps(release_dict).encode('utf-8'), diff --git a/python/fatcat_web/auth.py b/python/fatcat_web/auth.py index 8035cbe5..03964c92 100644 --- a/python/fatcat_web/auth.py +++ b/python/fatcat_web/auth.py @@ -90,7 +90,10 @@ def handle_ia_xauth(email, password): 'secret': Config.IA_XAUTH_CLIENT_SECRET, }) if resp.status_code == 401 or (not resp.json().get('success')): - flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + try: + flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + except: + print("IA XAuth fail: {}".format(resp.content)) return render_template('auth_ia_login.html', email=email), resp.status_code elif resp.status_code != 200: flash("Internet Archive login failed (internal error?)") diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index a5927d9b..926d5340 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -4,7 +4,7 @@ import json from flask import Flask, render_template, send_from_directory, request, \ url_for, abort, g, redirect, jsonify, session, flash from flask_login import login_required -from fatcat_web import app, api, auth_api +from fatcat_web import app, api, auth_api, priv_api from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth from fatcat_client.rest import ApiException from fatcat_web.search import do_search @@ -368,6 +368,8 @@ def search(): @app.route('/auth/login') def login(): # show the user a list of login options + if not priv_api: + flash("This web interface not configured with credentials to actually allow login (other than via token)") return render_template('auth_login.html') @app.route('/auth/ia/login', methods=['GET', 'POST']) diff --git a/python/fatcat_web/templates/container_view.html b/python/fatcat_web/templates/container_view.html index 29f0b9d9..4a175a5d 100644 --- a/python/fatcat_web/templates/container_view.html +++ b/python/fatcat_web/templates/container_view.html @@ -15,12 +15,6 @@ <p><b>Publisher:</b> {% if container.publisher != None %}{{ container.publisher }}{% else %}<i>Unknown</i>{% endif %} -{% if container.coden != None %} -<br><b>CODEN<sup><a href="https://en.wikipedia.org/wiki/CODEN">?</a></sup>:</b> <code>{{ container.coden }}</code> -{% endif %} -{% if container.abbrev != None %} -<br><b>Abbrev.:</b> <code>{{ container.abbrev }}</code> -{% endif %} {% if (container.extra != None) and (container.extra['url'] != None) and (container.extra['url']|length > 0) %} <br><b>Homepage:</b> <a href="{{ container.extra['url'] }}"> <code>{{ container.extra['url'] }}</code></a> {% endif %} diff --git a/python/fatcat_web/templates/release_view.html 
b/python/fatcat_web/templates/release_view.html index fd86b7c9..4e24b281 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -143,7 +143,7 @@ Raw Object: {% endif %} <br> -{% if release.refs.size != 0 %} +{% if release.refs != None and release.refs.size != 0 %} <h3>References</h3> This release citing other releases. <ol> diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py index cbe519b0..9ce32ed7 100644 --- a/python/fatcat_web/web_config.py +++ b/python/fatcat_web/web_config.py @@ -19,7 +19,7 @@ class Config(object): GIT_REVISION = subprocess.check_output(["git", "describe", "--always"]).strip().decode('utf-8') # This is, effectively, the QA/PROD flag - FATCAT_DOMAIN = os.environ.get("FATCAT_DOMAIN", default="qa.fatcat.wiki") + FATCAT_DOMAIN = os.environ.get("FATCAT_DOMAIN", default="dev.fatcat.wiki") FATCAT_API_AUTH_TOKEN = os.environ.get("FATCAT_API_AUTH_TOKEN", default=None) FATCAT_API_HOST = os.environ.get("FATCAT_API_HOST", default="https://{}/v0".format(FATCAT_DOMAIN)) @@ -39,10 +39,11 @@ class Config(object): IA_XAUTH_CLIENT_SECRET = os.environ.get("IA_XAUTH_CLIENT_SECRET", default=None) # protect cookies (which include API tokens) - SESSION_COOKIE_HTTPONLY = True - SESSION_COOKIE_SECURE = True - SESSION_COOKIE_SAMESITE = 'Lax' - PERMANENT_SESSION_LIFETIME = 2678400 # 31 days, in seconds + if FATCAT_DOMAIN != "dev.fatcat.wiki": + SESSION_COOKIE_HTTPONLY = True + SESSION_COOKIE_SECURE = True + SESSION_COOKIE_SAMESITE = 'Lax' + PERMANENT_SESSION_LIFETIME = 2678400 # 31 days, in seconds try: GIT_RELEASE = raven.fetch_git_sha('..') diff --git a/python/tests/api_annotations.py b/python/tests/api_annotations.py new file mode 100644 index 00000000..0d3c5046 --- /dev/null +++ b/python/tests/api_annotations.py @@ -0,0 +1,39 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_annotations(api): + + eg = quick_eg(api) + + # ensure no annotations on this object + a = api.get_editgroup_annotations(eg.editgroup_id) + assert a == [] + + # create an annotation! + api.create_editgroup_annotation( + eg.editgroup_id, + EditgroupAnnotation( + comment_markdown="some *annotation*", + extra=dict(thing="thang"))) + + # check that we can fetch it all sorts of ways + a = api.get_editgroup_annotations(eg.editgroup_id) + assert len(a) == 1 + assert a[0].extra['thing'] == "thang" + + # the editor persists, so this is a hack to find a "recent" one + a2 = api.get_editor_annotations(eg.editor_id, limit=100) + found = None + for thing in a2: + if thing.annotation_id == a[0].annotation_id: + found = thing + break + assert thing + assert thing.extra['thing'] == "thang" diff --git a/python/tests/api_containers.py b/python/tests/api_containers.py new file mode 100644 index 00000000..674ae3b8 --- /dev/null +++ b/python/tests/api_containers.py @@ -0,0 +1,48 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_container(api): + eg = quick_eg(api) + + # all the fields! 
+ c1 = ContainerEntity( + name="some container name", + container_type="journal", + publisher="some container publisher", + issnl="1234-567X", + wikidata_qid="Q954248", + extra=dict(a=1, b=2), + ) + + c1edit = api.create_container(c1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + c2 = api.get_container(c1edit.ident) + + # check that fields match + assert c1.name == c2.name + assert c1.container_type == c2.container_type + assert c1.publisher == c2.publisher + assert c1.issnl == c2.issnl + assert c1.wikidata_qid == c2.wikidata_qid + assert c1.extra == c2.extra + + # expansion + # TODO: via release + # lookup + # TODO: via issnl; but need to generate random identifiers + +def test_container_examples(api): + + api.lookup_container(issnl='1549-1277') + + c1 = api.get_container('aaaaaaaaaaaaaeiraaaaaaaaam') + assert c1.name == "PLOS Medicine" + assert c1.issnl == "1549-1277" + diff --git a/python/tests/api_creators.py b/python/tests/api_creators.py new file mode 100644 index 00000000..7443675b --- /dev/null +++ b/python/tests/api_creators.py @@ -0,0 +1,44 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_creators(api): + eg = quick_eg(api) + + # all the fields! + c1 = CreatorEntity( + display_name="Emma Smith", + given_name="emma", + surname="smith", + orcid="0000-0002-1825-0097", + wikidata_qid="Q9542248", + extra=dict(a=1, b=5), + ) + + c1edit = api.create_creator(c1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + c2 = api.get_creator(c1edit.ident) + + # check that fields match + assert c1.display_name == c2.display_name + assert c1.given_name == c2.given_name + assert c1.surname == c2.surname + assert c1.orcid == c2.orcid + assert c1.wikidata_qid == c2.wikidata_qid + assert c1.extra == c2.extra + + # expansion + # TODO: via release + # lookup + # TODO: via issnl; but need to generate random identifiers + +def test_creators_examples(api): + # TODO: aaaaaaaaaaaaaircaaaaaaaaam + + api.lookup_creator(orcid='0000-0003-3118-6859') diff --git a/python/tests/api_editgroups.py b/python/tests/api_editgroups.py new file mode 100644 index 00000000..722d8686 --- /dev/null +++ b/python/tests/api_editgroups.py @@ -0,0 +1,140 @@ + +import json +import pytest +import datetime +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_editgroup_submit(api): + # 1. check that edit group can be submitted/unsubmitted, and shows up in reviewable appropriately + # 2. 
accepted edits don't show up as reviewable and can't be submitted + + c1 = CreatorEntity(display_name="test updates") + eg = quick_eg(api) + c1 = api.get_creator(api.create_creator(c1, editgroup_id=eg.editgroup_id).ident) + + eg2 = api.get_editgroup(eg.editgroup_id) + assert not eg2.submitted + assert not eg2.changelog_index + + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id not in [v.editgroup_id for v in reviewable] + wip = api.get_editor_editgroups(eg.editor_id, limit=100) + assert eg.editgroup_id in [v.editgroup_id for v in wip] + + api.update_editgroup(eg.editgroup_id, eg2, submit=True) + eg3 = api.get_editgroup(eg.editgroup_id) + assert eg3.submitted + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id in [v.editgroup_id for v in reviewable] + + api.update_editgroup(eg.editgroup_id, eg2, submit=False) + eg3 = api.get_editgroup(eg.editgroup_id) + assert not eg3.submitted + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id not in [v.editgroup_id for v in reviewable] + + # put back in reviewable + api.update_editgroup(eg.editgroup_id, eg2, submit=True) + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id in [v.editgroup_id for v in reviewable] + + # shouldn't be reviewable if accepted + api.accept_editgroup(eg.editgroup_id) + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id not in [v.editgroup_id for v in reviewable] + eg3 = api.get_editgroup(eg.editgroup_id) + #print(eg3) + assert eg3.submitted + assert eg3.changelog_index + + with pytest.raises(fatcat_client.rest.ApiException): + api.update_editgroup(eg.editgroup_id, eg3, submit=True) + with pytest.raises(fatcat_client.rest.ApiException): + eg3.description = "something" + api.update_editgroup(eg.editgroup_id, eg3) + + +def test_editgroup_ordering(api): + + eg1 = quick_eg(api) + eg2 = quick_eg(api) + api.update_editgroup( + eg1.editgroup_id, + Editgroup(editgroup_id=eg1.editgroup_id, description="FAIL"), + submit=True) + api.update_editgroup( + eg2.editgroup_id, + Editgroup(editgroup_id=eg2.editgroup_id, description="FAIL"), + submit=True) + + r1 = api.get_editgroups_reviewable() + #print(r1) + assert not r1[0].description + assert not r1[1].description + assert r1[0].submitted >= r1[1].submitted + + # should be no editgroups "in the future" (since now + 1sec) + r1 = api.get_editgroups_reviewable(since=(datetime.datetime.utcnow() + datetime.timedelta(seconds=1)).isoformat()+"Z") + assert not r1 + + r1 = api.get_editgroups_reviewable(since=(datetime.datetime.utcnow() - datetime.timedelta(seconds=5)).isoformat()+"Z") + assert r1[0].submitted <= r1[1].submitted + + +def test_editgroup_autoaccept(api): + # autoaccept changes: editgroups required when, in what combination + + eg = quick_eg(api) + c1 = CreatorEntity(display_name="test autoaccept") + c2 = CreatorEntity(display_name="test another autoaccept") + + with pytest.raises(fatcat_client.rest.ApiException): + edits = api.create_creator_batch([c1, c2]) + + with pytest.raises(fatcat_client.rest.ApiException): + edits = api.create_creator_batch([c1, c2], editgroup_id=eg.editgroup_id, autoaccept=True) + + edits1 = api.create_creator_batch([c1, c2], editgroup_id=eg.editgroup_id) + edits2 = api.create_creator_batch([c1, c2], autoaccept=True) + + assert edits1[0].editgroup_id == eg.editgroup_id + assert edits1[0].editgroup_id != edits2[1].editgroup_id + eg1 = api.get_editgroup(edits1[0].editgroup_id) + eg2 = 
api.get_editgroup(edits2[0].editgroup_id) + + assert not eg1.changelog_index + assert eg2.changelog_index + #print(edits1) + #print(eg1.edits.creators) + assert eg1.edits.creators[0].ident in [t.ident for t in edits1] + assert eg2.edits.creators[0].ident in [t.ident for t in edits2] + + +def test_batch_params(api): + + eg = quick_eg(api) + c1 = CreatorEntity(display_name="test autoaccept") + c2 = CreatorEntity(display_name="test another autoaccept") + + with pytest.raises(fatcat_client.rest.ApiException): + edits = api.create_creator_batch([c1, c2]) + + desc = "test description" + extra = dict(a=75, q="thing") + edits = api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra=json.dumps(extra)) + eg = api.get_editgroup(edits[0].editgroup_id) + + assert eg.description == desc + assert eg.extra == extra + + # currently must manually json dumps() extra field + with pytest.raises(fatcat_client.rest.ApiException): + api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra=extra) + + with pytest.raises(fatcat_client.rest.ApiException): + api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra="{") diff --git a/python/tests/api_files.py b/python/tests/api_files.py new file mode 100644 index 00000000..033538ef --- /dev/null +++ b/python/tests/api_files.py @@ -0,0 +1,52 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_file(api): + + eg = quick_eg(api) + + # all the fields! + f1 = FileEntity( + size=89238, + md5="7ce6615b2a5904939576d9567bd5f68e", + sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2", + sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3", + mimetype="application/pdf", + extra=dict(a=2, b=5), + urls=[ + FileEntityUrls(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"), + ], + release_ids=[], + ) + + f1edit = api.create_file(f1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + f2 = api.get_file(f1edit.ident) + + # check that fields match + assert f1.size == f2.size + assert f1.md5 == f2.md5 + assert f1.sha1 == f2.sha1 + assert f1.sha256 == f2.sha256 + assert f1.mimetype == f2.mimetype + assert f1.extra == f2.extra + assert f1.urls == f2.urls + assert f1.release_ids == f2.release_ids + + # expansion + # TODO: via release + # lookup + # TODO: via hashes; but need to generate random? 
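One possible approach to the "need to generate random" TODO above, as a sketch only (the secrets-based helper is an assumption, not part of these tests): random hex digests of the right length would let a lookup test exercise the not-found path without colliding with fixture data.

    import secrets

    def random_file_hashes():
        # hex digests with the correct lengths for each hash type (assumption:
        # the lookup endpoint validates format, so a random digest simply
        # fails to match any entity)
        return dict(
            sha1=secrets.token_hex(20),    # 40 hex chars
            sha256=secrets.token_hex(32),  # 64 hex chars
            md5=secrets.token_hex(16),     # 32 hex chars
        )

    # a lookup on a random digest would then be expected to raise a 404
    # ApiException, mirroring the try_update() checks in the importers:
    #   with pytest.raises(ApiException):
    #       api.lookup_file(sha1=random_file_hashes()['sha1'])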
+ +def test_file_examples(api): + + api.lookup_file(sha256='ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362') + + f1 = api.get_file('aaaaaaaaaaaaamztaaaaaaaaam') diff --git a/python/tests/api_filesets.py b/python/tests/api_filesets.py new file mode 100644 index 00000000..966b85ca --- /dev/null +++ b/python/tests/api_filesets.py @@ -0,0 +1,79 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_fileset(api): + + eg = quick_eg(api) + r1 = ReleaseEntity(title="test fileset release") + r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id) + + fs1 = FilesetEntity( + manifest = [ + FilesetEntityManifest( + path="data/thing.tar.gz", + size=54321, + md5="540da3ea6e448d8dfb057c05225f853a", + sha1="1dab6a0e110f9b5d70b18db0abf051f7f93faf06", + sha256="c7b49f3e84cd1b7cb0b0e3e9f632b7be7e21b4dc229df23331f880a8a7dfa75a", + extra={"a": 1, "b": 3}, + ), + FilesetEntityManifest( + path="README.md", + size=54210, + md5="5f83592b5249671719bbed6ce91ecfa8", + sha1="455face3598611458efe1f072e58624790a67266", + sha256="429bcafa4d3d0072d5b2511e12c85c1aac1d304011d1c406da14707f7b9cd905", + extra={"x": 1, "y": "q"}, + ), + ], + urls = [ + FileEntityUrls(url="https://archive.org/download/fileset-123/", rel="repository"), + FileEntityUrls(url="https://humble-host.com/~user123/dataset/", rel="web"), + ], + release_ids = [r1edit.ident], + ) + + fs1edit = api.create_fileset(fs1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + fs2 = api.get_fileset(fs1edit.ident) + + # check that fields match + assert fs1.urls == fs2.urls + assert fs1.manifest == fs2.manifest + assert fs1.release_ids == fs2.release_ids + + # expansion + r1 = api.get_release(r1edit.ident, expand="filesets") + assert r1.filesets[0].manifest == fs1.manifest + +def test_fileset_examples(api): + fs3 = api.get_fileset('aaaaaaaaaaaaaztgaaaaaaaaam') + + assert fs3.urls[0].url == 'http://other-personal-blog.name/dataset/' + assert fs3.urls[1].rel == 'archive' + assert fs3.manifest[1].md5 == 'f4de91152c7ab9fdc2a128f962faebff' + assert fs3.manifest[1].extra['mimetype'] == 'application/gzip' + +def test_bad_fileset(api): + + eg = quick_eg(api) + + bad_list = [ + # good (for testing test itself) + #FilesetEntity(manifest=[FilesetEntityManifest(path="123.jpg", size=1234)]), + #FilesetEntity(urls=[FileEntityUrls(url="thing", rel="blah")]), + FilesetEntity(manifest=[FilesetEntityManifest(path="123.jpg", size="big")]), + FilesetEntity(release_ids=["asdf"]), + ] + + for b in bad_list: + with pytest.raises(fatcat_client.rest.ApiException): + api.create_fileset(b, editgroup_id=eg.editgroup_id) + diff --git a/python/tests/api_misc.py b/python/tests/api_misc.py index 3510ea82..0a0f16da 100644 --- a/python/tests/api_misc.py +++ b/python/tests/api_misc.py @@ -8,14 +8,6 @@ from fatcat_client.rest import ApiException from fixtures import * -def test_lookups(api): - - api.lookup_creator(orcid='0000-0003-3118-6859') - api.lookup_container(issnl='1549-1277') - api.lookup_file(sha256='ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362') - api.lookup_release(pmid='54321') - api.lookup_release(isbn13='978-3-16-148410-0') - def test_lookup_hide_extend(api): r = api.lookup_release(doi='10.1371/journal.pmed.0020124') diff --git a/python/tests/api_releases.py b/python/tests/api_releases.py new file mode 100644 index 00000000..ed6f24a4 --- /dev/null +++ b/python/tests/api_releases.py @@ -0,0 +1,103 @@ + +import json 
+import pytest +import datetime +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_release(api): + + eg = quick_eg(api) + + # all the fields! + r1 = ReleaseEntity( + title="some title", + original_title="оригинальное название", + release_type="post-weblog", + release_status="pre-print", + release_date=datetime.datetime.utcnow().date(), + release_year=2015, + doi="10.5555/12345678", + pmid="12345", + pmcid="PMC4321", + wikidata_qid="Q1234", + isbn13="978-3-16-148410-0", + core_id="187348", + arxiv_id="aslkdjfh", + jstor_id="8328424", + volume="84", + issue="XII", + pages="4-99", + publisher="some publisher", + language="en", + license_slug="CC-0", + extra=dict(a=1, b=2), + contribs=[], + refs=[], + abstracts=[ + ReleaseEntityAbstracts( + content="this is some abstract", + mimetype="text/plain", + lang="en"), + ReleaseEntityAbstracts( + content="this is some other abstract", + mimetype="text/plain", + lang="de"), + ], + ) + + r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r2 = api.get_release(r1edit.ident) + + # check that fields match + assert r1.title == r2.title + assert r1.original_title == r2.original_title + assert r1.release_type == r2.release_type + assert r1.release_date == r2.release_date + assert r1.release_year == r2.release_year + assert r1.doi == r2.doi + assert r1.pmid == r2.pmid + assert r1.pmcid == r2.pmcid + assert r1.wikidata_qid == r2.wikidata_qid + assert r1.isbn13 == r2.isbn13 + assert r1.core_id == r2.core_id + assert r1.arxiv_id == r2.arxiv_id + assert r1.jstor_id == r2.jstor_id + assert r1.volume == r2.volume + assert r1.issue == r2.issue + assert r1.pages == r2.pages + assert r1.publisher == r2.publisher + assert r1.language == r2.language + assert r1.license_slug == r2.license_slug + assert r1.extra == r2.extra + + for i in range(len(r1.abstracts)): + r1.abstracts[i].content == r2.abstracts[i].content + r1.abstracts[i].mimetype == r2.abstracts[i].mimetype + r1.abstracts[i].lang == r2.abstracts[i].lang + for i in range(len(r1.contribs)): + r1.contribs[i] == r2.contribs[i] + for i in range(len(r1.refs)): + r1.refs[i] == r2.refs[i] + + # expansion + # TODO: via work + # lookup + # TODO: via all; but need to generate random identifiers + +def test_release_examples(api): + + api.lookup_release(pmid='54321') + api.lookup_release(isbn13='978-3-16-148410-0') + + r1 = api.get_release('aaaaaaaaaaaaarceaaaaaaaaai') + assert r1.title == "bigger example" + assert len(r1.refs) == 5 + assert r1.contribs[0].role == "editor" + assert r1.abstracts[0].mimetype == "application/xml+jats" + diff --git a/python/tests/api_webcaptures.py b/python/tests/api_webcaptures.py new file mode 100644 index 00000000..dc1754b3 --- /dev/null +++ b/python/tests/api_webcaptures.py @@ -0,0 +1,96 @@ + +import json +import pytest +import datetime +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_webcapture(api): + + eg = quick_eg(api) + r1 = ReleaseEntity(title="test webcapture release") + r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id) + + wc1 = WebcaptureEntity( + original_url = "http://example.site", + #timestamp = "2012-01-02T03:04:05Z", + timestamp = datetime.datetime.now(datetime.timezone.utc), + cdx = [ + WebcaptureEntityCdx( + surt="site,example,)/data/thing.tar.gz", + #timestamp="2012-01-02T03:04:05Z", + 
timestamp=datetime.datetime.now(datetime.timezone.utc), + url="http://example.site/data/thing.tar.gz", + mimetype="application/gzip", + status_code=200, + sha1="455face3598611458efe1f072e58624790a67266", + sha256="c7b49f3e84cd1b7cb0b0e3e9f632b7be7e21b4dc229df23331f880a8a7dfa75a", + ), + WebcaptureEntityCdx( + surt="site,example,)/README.md", + #timestamp="2012-01-02T03:04:05Z", + timestamp=datetime.datetime.now(datetime.timezone.utc), + url="http://example.site/README.md", + mimetype="text/markdown", + status_code=200, + sha1="455face3598611458efe1f072e58624790a67266", + sha256="429bcafa4d3d0072d5b2511e12c85c1aac1d304011d1c406da14707f7b9cd905", + ), + ], + archive_urls = [ + FileEntityUrls(rel="wayback", url="https://web.archive.org/web/"), + ], + release_ids = [r1edit.ident], + ) + + wc1edit = api.create_webcapture(wc1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + wc2 = api.get_webcapture(wc1edit.ident) + + # check that fields match + # I don't know why these aren't equal... + #print(wc1.archive_urls) + #print(wc2.archive_urls) + #assert wc1.archive_urls == wc2.archive_urls + assert wc1.archive_urls[0].rel == wc2.archive_urls[0].rel + assert wc1.archive_urls[0].url == wc2.archive_urls[0].url + assert wc1.cdx == wc2.cdx + assert wc1.release_ids == wc2.release_ids + assert wc1.timestamp == wc2.timestamp + assert wc1.original_url == wc2.original_url + + # TODO: check release expansion + r1 = api.get_release(r1edit.ident, expand="webcaptures") + print(r1) + assert r1.webcaptures[0].cdx == wc1.cdx + +def test_webcapture_examples(api): + wc3 = api.get_webcapture('aaaaaaaaaaaaa53xaaaaaaaaam') + + assert wc3.cdx[0].surt == 'org,asheesh)/' + assert wc3.cdx[1].sha1 == 'a637f1d27d9bcb237310ed29f19c07e1c8cf0aa5' + assert wc3.archive_urls[1].rel == 'warc' + + +def test_bad_webcapture(api): + + eg = quick_eg(api) + + bad_list = [ + # good (for testing test itself) + WebcaptureEntity(cdx=[ + WebcaptureEntityCdx( + surt="site,example,)/123.jpg", + url="http://example.site/123.jpg", + sha1="455face3598611458efe1f072e58624790a67266", + timestamp=201506071122)]), + ] + + for b in bad_list: + with pytest.raises(fatcat_client.rest.ApiException): + api.create_webcapture(b, editgroup_id=eg.editgroup_id) + diff --git a/python/tests/citation_efficiency.py b/python/tests/citation_efficiency.py new file mode 100644 index 00000000..fe5006cc --- /dev/null +++ b/python/tests/citation_efficiency.py @@ -0,0 +1,113 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_citation_indexing(api): + # indexing is consistent and reacts to change + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1.refs = [ + ReleaseRef(key="first", title="the first title"), + ReleaseRef(key="second", title="the second title"), + ReleaseRef(key="third", title="a third title"), + ] + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert r1.refs[1].key == "second" + assert r1.refs[2].index == 2 + assert r1.refs[2].key == "third" + + r1.refs.pop(1) + eg = quick_eg(api) + api.update_release(r1.ident, r1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r1 = api.get_release(r1.ident) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert 
r1.refs[1].key == "third" + +def test_citation_targets(api): + # invariant to linking citations + # also, updates work + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = ReleaseEntity(title="the citer") + r2.refs = [ + ReleaseRef(key="first", title="something else"), + ReleaseRef(key="second", title="the target title"), + ] + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + eg = quick_eg(api) + r2.refs[1].target_release_id = r1.ident + api.update_release(r2.ident, r2, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r2 = api.get_release(r2.ident) + assert r2.refs[0].key == "first" + assert r2.refs[1].key == "second" + assert r2.refs[0].index == 0 # TODO: one-indexing? + assert r2.refs[1].index == 1 + assert r2.refs[0].target_release_id == None + assert r2.refs[1].target_release_id == r1.ident + assert len(r2.refs) == 2 + +def test_citation_empty_array(api): + # distinction between empty array (no citations) and no array (hidden) + + r1 = ReleaseEntity(title="citation null") + r2 = ReleaseEntity(title="citation empty array") + r1.refs = None + r2.refs = [] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + print(r1.refs) + print(r2.refs) + assert r1.refs == [] + assert r1.refs == r2.refs + + r1b = api.get_release(r1.ident, hide="refs") + assert r1b.refs == None + +def test_citation_encoding(api): + # escape-only changes (eg, \u1234 whatever for ASCII) + + r1 = ReleaseEntity(title="citation encoding") + title = "title-unicode \\u0050 \\\" " + container = "container-unicode ☃︎ ä ö ü スティー" + extra = extra={'a': 1, 'b': 2, 'ö': 3} + locator = "p123" + r1.refs = [ + ReleaseRef(key="1", year=1923, title=title, container_name=container, + extra=extra, locator=locator), + ReleaseRef(key="2"), + ] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert title == r1.refs[0].title + assert container == r1.refs[0].container_name + assert extra == r1.refs[0].extra + assert locator == r1.refs[0].locator + diff --git a/python/tests/cli.sh b/python/tests/cli.sh index eba6d3a7..19d8a85b 100755 --- a/python/tests/cli.sh +++ b/python/tests/cli.sh @@ -14,7 +14,7 @@ set -x ./fatcat_import.py crossref tests/files/crossref-works.2018-01-21.badsample.json tests/files/ISSN-to-ISSN-L.snip.txt ./fatcat_import.py orcid tests/files/0000-0001-8254-7103.json -./fatcat_import.py issn tests/files/journal_extra_metadata.snip.csv +./fatcat_import.py journal-metadata tests/files/journal_extra_metadata.snip.csv ./fatcat_import.py matched tests/files/matched_sample.json ./fatcat_import.py matched tests/files/example_matched.json ./fatcat_import.py grobid-metadata tests/files/example_grobid_metadata_lines.tsv diff --git a/python/tests/files/crossref-works.single.json b/python/tests/files/crossref-works.single.json index 2af2b358..e3d2e05c 100644 --- a/python/tests/files/crossref-works.single.json +++ b/python/tests/files/crossref-works.single.json @@ -84,7 +84,7 @@ { "given": "Carlos G.", "family": "Diaz", - "affiliation": ["Some University"] + "affiliation": [{"name": "Some University"}, {"name": "Some Department"}] }, { "given": "Francisco M.", diff 
--git a/python/tests/fixtures.py b/python/tests/fixtures.py index 6a880c48..3cc275b3 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -28,6 +28,7 @@ def api(): conf.api_key["Authorization"] = os.getenv("FATCAT_API_AUTH_TOKEN") conf.api_key_prefix["Authorization"] = "Bearer" api_client = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) + api_client.editor_id = "aaaaaaaaaaaabkvkaaaaaaaaae" return api_client def test_get_changelog_entry(api): @@ -38,33 +39,6 @@ def test_get_changelog_entry(api): ## Helpers ################################################################## def quick_eg(api_inst): - eg = api_inst.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + eg = api_inst.create_editgroup(fatcat_client.Editgroup()) return eg -# TODO: what are these even here for? -def check_entity_fields(e): - for key in ('rev', 'is_live', 'redirect_id'): - assert key in e - for key in ('id',): - assert e[key] is not None - -def check_release(e): - for key in ('work', 'release_type'): - assert key in e - for key in ('title', ): - assert e[key] is not None - for key in ('refs', 'creators'): - assert type(e[key]) == list - -def check_creator(e): - for key in ('name',): - assert e[key] is not None - -def check_container(e): - for key in ('name',): - assert e[key] is not None - -def check_file(e): - for key in ('size', 'sha1'): - assert e[key] is not None diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index e2ca6122..193f78f6 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -1,35 +1,51 @@ import json import pytest -from fatcat_tools.importers import CrossrefImporter +from fatcat_tools.importers import CrossrefImporter, JsonLinePusher from fixtures import api @pytest.fixture(scope="function") def crossref_importer(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', check_existing=False) + yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True) @pytest.fixture(scope="function") def crossref_importer_existing(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', check_existing=True) + yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) def test_crossref_importer_batch(crossref_importer): with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - crossref_importer.process_batch(f) + JsonLinePusher(crossref_importer, f).run() def test_crossref_importer(crossref_importer): + last_index = crossref_importer.api.get_changelog(limit=1)[0].index with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - crossref_importer.process_source(f) + crossref_importer.bezerk_mode = True + counts = JsonLinePusher(crossref_importer, f).run() + assert counts['insert'] == 14 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + # fetch most recent editgroup - changes = crossref_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = crossref_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "crossref" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.CrossrefImporter" in eg.extra['agent'] + last_index = 
crossref_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: + crossref_importer.bezerk_mode = False + crossref_importer.reset() + counts = JsonLinePusher(crossref_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 14 + assert counts['skip'] == 0 + assert last_index == crossref_importer.api.get_changelog(limit=1)[0].index + def test_crossref_mappings(crossref_importer): assert crossref_importer.map_release_type('journal-article') == "article-journal" assert crossref_importer.map_release_type('asdf') is None @@ -39,13 +55,13 @@ def test_crossref_mappings(crossref_importer): def test_crossref_importer_create(crossref_importer): crossref_importer.create_containers = True with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - crossref_importer.process_source(f) + JsonLinePusher(crossref_importer, f).run() def test_crossref_dict_parse(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line raw = json.loads(f.read()) - (r, c) = crossref_importer.parse_crossref_dict(raw) + r = crossref_importer.parse_record(raw) extra = r.extra['crossref'] assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators" assert r.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t" @@ -61,7 +77,8 @@ def test_crossref_dict_parse(crossref_importer): assert len(r.contribs) == 5 assert r.contribs[0].raw_name == "Marcelo D. Radicioni" assert r.contribs[0].index == 0 - assert r.contribs[1].extra['affiliations'] == ["Some University"] + assert r.contribs[1].raw_affiliation == "Some University" + assert r.contribs[1].extra['more_affiliations'] == ["Some Department"] assert r.contribs[1].role == "author" assert r.contribs[3].role == "editor" assert r.contribs[3].index is None @@ -78,8 +95,10 @@ def test_crossref_dict_parse(crossref_importer): def test_stateful_checking(crossref_importer_existing): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line, a whole document - raw = json.loads(f.read()) + raw = f.read() # might not exist yet... - crossref_importer_existing.process_source([json.dumps(raw)]) - # ok, make sure we get 'None' back - assert crossref_importer_existing.parse_crossref_dict(raw) is None + crossref_importer_existing.push_record(json.loads(raw)) + crossref_importer_existing.finish() + # make sure we wouldn't insert again + entity = crossref_importer_existing.parse_record(json.loads(raw)) + assert crossref_importer_existing.try_update(entity) is False diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py index 97ebcaef..4fed4aaa 100644 --- a/python/tests/import_grobid_metadata.py +++ b/python/tests/import_grobid_metadata.py @@ -3,7 +3,7 @@ import os import json import base64 import pytest -from fatcat_tools.importers import GrobidMetadataImporter +from fatcat_tools.importers import GrobidMetadataImporter, LinePusher from fixtures import api """ @@ -15,10 +15,6 @@ side-effects. Should probably be disabled or re-written. def grobid_metadata_importer(api): yield GrobidMetadataImporter(api) -# TODO: use API to check that entities actually created... 
-#def test_grobid_metadata_importer_batch(grobid_metadata_importer): -# with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: -# grobid_metadata_importer.process_batch(f) def test_grobid_metadata_parse(grobid_metadata_importer): with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: @@ -30,7 +26,8 @@ def test_grobid_metadata_parse(grobid_metadata_importer): print(re.contribs) assert re.contribs[0].raw_name == "Wahyu Ary" assert re.publisher == None - assert re.extra.get('container_name') == None + if re.extra: + assert re.extra.get('container_name') == None assert len(re.refs) == 27 def test_file_metadata_parse(grobid_metadata_importer): @@ -53,13 +50,28 @@ def test_file_metadata_parse(grobid_metadata_importer): assert len(fe.release_ids) == 0 def test_grobid_metadata_importer(grobid_metadata_importer): + last_index = grobid_metadata_importer.api.get_changelog(limit=1)[0].index with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: - grobid_metadata_importer.process_source(f) + grobid_metadata_importer.bezerk_mode = True + counts = LinePusher(grobid_metadata_importer, f).run() + assert counts['insert'] == 10 + assert counts['inserted.release'] == 10 + assert counts['exists'] == 0 + assert counts['skip'] == 0 # fetch most recent editgroup - changes = grobid_metadata_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = grobid_metadata_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "grobid" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.GrobidMetadataImporter" in eg.extra['agent'] + + with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: + grobid_metadata_importer.reset() + grobid_metadata_importer.bezerk_mode = False + counts = LinePusher(grobid_metadata_importer, f).run() + assert counts['insert'] == 0 + assert counts['inserted.release'] == 0 + assert counts['exists'] == 10 + assert counts['skip'] == 0 diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py deleted file mode 100644 index 6b5978d9..00000000 --- a/python/tests/import_issn.py +++ /dev/null @@ -1,26 +0,0 @@ - -import pytest -from fatcat_tools.importers import IssnImporter -from fixtures import api - - -@pytest.fixture(scope="function") -def issn_importer(api): - yield IssnImporter(api) - -# TODO: use API to check that entities actually created... -def test_issn_importer_batch(issn_importer): - with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: - issn_importer.process_csv_batch(f) - -def test_issn_importer(issn_importer): - with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: - issn_importer.process_csv_source(f) - - # fetch most recent editgroup - changes = issn_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup - assert eg.description - assert "container" in eg.description.lower() - assert eg.extra['git_rev'] - assert "fatcat_tools.IssnImporter" in eg.extra['agent'] diff --git a/python/tests/import_journal_metadata.py b/python/tests/import_journal_metadata.py new file mode 100644 index 00000000..a2b10a65 --- /dev/null +++ b/python/tests/import_journal_metadata.py @@ -0,0 +1,39 @@ + +import pytest +from fatcat_tools.importers import JournalMetadataImporter, CsvPusher +from fixtures import api + + +@pytest.fixture(scope="function") +def journal_metadata_importer(api): + yield JournalMetadataImporter(api) + +# TODO: use API to check that entities actually created... 
+def test_journal_metadata_importer_batch(journal_metadata_importer): + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + CsvPusher(journal_metadata_importer, f).run() + +def test_journal_metadata_importer(journal_metadata_importer): + last_index = journal_metadata_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + journal_metadata_importer.bezerk_mode = True + counts = CsvPusher(journal_metadata_importer, f).run() + assert counts['insert'] == 9 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = journal_metadata_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "container" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.JournalMetadataImporter" in eg.extra['agent'] + + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + journal_metadata_importer.reset() + journal_metadata_importer.bezerk_mode = False + counts = CsvPusher(journal_metadata_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 9 + assert counts['skip'] == 0 diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py index 080674ac..8f694456 100644 --- a/python/tests/import_matched.py +++ b/python/tests/import_matched.py @@ -1,7 +1,7 @@ import json import pytest -from fatcat_tools.importers import MatchedImporter +from fatcat_tools.importers import MatchedImporter, JsonLinePusher from fixtures import api @@ -10,26 +10,40 @@ def matched_importer(api): yield MatchedImporter(api) # TODO: use API to check that entities actually created... -def test_matched_importer_batch(matched_importer): +def test_matched_importer(matched_importer): with open('tests/files/example_matched.json', 'r') as f: - matched_importer.process_batch(f) + JsonLinePusher(matched_importer, f).run() def test_matched_importer(matched_importer): + last_index = matched_importer.api.get_changelog(limit=1)[0].index with open('tests/files/example_matched.json', 'r') as f: - matched_importer.process_source(f) + matched_importer.bezerk_mode = True + counts = JsonLinePusher(matched_importer, f).run() + assert counts['insert'] == 2 + assert counts['exists'] == 0 + assert counts['skip'] == 11 # fetch most recent editgroup - changes = matched_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = matched_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "file-to-release" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.MatchedImporter" in eg.extra['agent'] + # re-insert; should skip + with open('tests/files/example_matched.json', 'r') as f: + matched_importer.reset() + matched_importer.bezerk_mode = False + counts = JsonLinePusher(matched_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 2 + assert counts['skip'] == 11 + def test_matched_dict_parse(matched_importer): with open('tests/files/example_matched.json', 'r') as f: raw = json.loads(f.readline()) - f = matched_importer.parse_matched_dict(raw) + f = matched_importer.parse_record(raw) assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313" assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff" assert f.mimetype == "application/pdf" diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py index 717a1328..57886b52 100644 --- a/python/tests/import_orcid.py +++ b/python/tests/import_orcid.py @@ 
-1,7 +1,7 @@ import json import pytest -from fatcat_tools.importers import OrcidImporter +from fatcat_tools.importers import OrcidImporter, JsonLinePusher from fixtures import api @@ -9,37 +9,46 @@ from fixtures import api def orcid_importer(api): yield OrcidImporter(api) -# TODO: use API to check that entities actually created... -def test_orcid_importer_batch(orcid_importer): - with open('tests/files/0000-0001-8254-7103.json', 'r') as f: - orcid_importer.process_batch(f) - def test_orcid_importer_badid(orcid_importer): with open('tests/files/0000-0001-8254-710X.json', 'r') as f: - orcid_importer.process_batch(f) + JsonLinePusher(orcid_importer, f).run() +# TODO: use API to check that entities actually created... def test_orcid_importer(orcid_importer): + last_index = orcid_importer.api.get_changelog(limit=1)[0].index with open('tests/files/0000-0001-8254-7103.json', 'r') as f: - orcid_importer.process_source(f) + orcid_importer.bezerk_mode = True + counts = JsonLinePusher(orcid_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 # fetch most recent editgroup - changes = orcid_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = orcid_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "orcid" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.OrcidImporter" in eg.extra['agent'] + with open('tests/files/0000-0001-8254-7103.json', 'r') as f: + orcid_importer.reset() + orcid_importer.bezerk_mode = False + counts = JsonLinePusher(orcid_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + def test_orcid_importer_x(orcid_importer): with open('tests/files/0000-0003-3953-765X.json', 'r') as f: - orcid_importer.process_source(f) + JsonLinePusher(orcid_importer, f).run() c = orcid_importer.api.lookup_creator(orcid="0000-0003-3953-765X") assert c is not None def test_orcid_dict_parse(orcid_importer): with open('tests/files/0000-0001-8254-7103.json', 'r') as f: raw = json.loads(f.readline()) - c = orcid_importer.parse_orcid_dict(raw) + c = orcid_importer.parse_record(raw) assert c.given_name == "Man-Hui" assert c.surname == "Li" assert c.display_name == "Man-Hui Li" diff --git a/python/tests/importer.py b/python/tests/importer.py index 34efa5d8..9308ba84 100644 --- a/python/tests/importer.py +++ b/python/tests/importer.py @@ -1,13 +1,13 @@ import pytest -from fatcat_tools.importers import FatcatImporter +from fatcat_tools.importers import CrossrefImporter, OrcidImporter from fixtures import api def test_issnl_mapping_lookup(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - fi = FatcatImporter(api, issn_map_file=issn_file) + fi = CrossrefImporter(api, issn_map_file=issn_file) assert fi.issn2issnl('0000-0027') == '0002-0027' assert fi.issn2issnl('0002-0027') == '0002-0027' @@ -18,20 +18,18 @@ def test_issnl_mapping_lookup(api): def test_identifiers(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - fi = FatcatImporter(api, issn_map_file=issn_file) - - assert fi.is_issnl("1234-5678") == True - assert fi.is_issnl("1234-5678.") == False - assert fi.is_issnl("12345678") == False - assert fi.is_issnl("1-2345678") == False - - assert fi.is_doi("10.1234/56789") == True - assert fi.is_doi("101234/56789") == False - assert fi.is_doi("10.1234_56789") == False - - assert fi.is_orcid("0000-0003-3118-6591") == True - assert 
fi.is_orcid("0000-0003-3953-765X") == True - assert fi.is_orcid("0000-00x3-3118-659") == False - assert fi.is_orcid("0000-00033118-659") == False - assert fi.is_orcid("0000-0003-3118-659.") == False + ci = CrossrefImporter(api, issn_map_file=issn_file) + + assert ci.is_issnl("1234-5678") == True + assert ci.is_issnl("1234-5678.") == False + assert ci.is_issnl("12345678") == False + assert ci.is_issnl("1-2345678") == False + + oi = OrcidImporter(api) + + assert oi.is_orcid("0000-0003-3118-6591") == True + assert oi.is_orcid("0000-0003-3953-765X") == True + assert oi.is_orcid("0000-00x3-3118-659") == False + assert oi.is_orcid("0000-00033118-659") == False + assert oi.is_orcid("0000-0003-3118-659.") == False diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py index e9d23250..6d6c6c82 100644 --- a/python/tests/transform_tests.py +++ b/python/tests/transform_tests.py @@ -11,7 +11,7 @@ def test_elasticsearch_convert(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line raw = json.loads(f.read()) - (r, c) = crossref_importer.parse_crossref_dict(raw) + r = crossref_importer.parse_record(raw) r.state = 'active' release_to_elasticsearch(r) diff --git a/python_client/fatcat_client/api/default_api.py b/python_client/fatcat_client/api/default_api.py index c5b99d8a..75e5b0a8 100644 --- a/python_client/fatcat_client/api/default_api.py +++ b/python_client/fatcat_client/api/default_api.py @@ -445,6 +445,8 @@ class DefaultApi(object): :param list[ContainerEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. @@ -468,12 +470,14 @@ class DefaultApi(object): :param list[ContainerEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. """ - all_params = ['entity_list', 'autoaccept', 'editgroup_id'] # noqa: E501 + all_params = ['entity_list', 'autoaccept', 'editgroup_id', 'description', 'extra'] # noqa: E501 all_params.append('async') all_params.append('_return_http_data_only') all_params.append('_preload_content') @@ -502,6 +506,10 @@ class DefaultApi(object): query_params.append(('autoaccept', params['autoaccept'])) # noqa: E501 if 'editgroup_id' in params: query_params.append(('editgroup_id', params['editgroup_id'])) # noqa: E501 + if 'description' in params: + query_params.append(('description', params['description'])) # noqa: E501 + if 'extra' in params: + query_params.append(('extra', params['extra'])) # noqa: E501 header_params = {} @@ -655,6 +663,8 @@ class DefaultApi(object): :param list[CreatorEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. 
@@ -678,12 +688,14 @@ class DefaultApi(object): :param list[CreatorEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. """ - all_params = ['entity_list', 'autoaccept', 'editgroup_id'] # noqa: E501 + all_params = ['entity_list', 'autoaccept', 'editgroup_id', 'description', 'extra'] # noqa: E501 all_params.append('async') all_params.append('_return_http_data_only') all_params.append('_preload_content') @@ -712,6 +724,10 @@ class DefaultApi(object): query_params.append(('autoaccept', params['autoaccept'])) # noqa: E501 if 'editgroup_id' in params: query_params.append(('editgroup_id', params['editgroup_id'])) # noqa: E501 + if 'description' in params: + query_params.append(('description', params['description'])) # noqa: E501 + if 'extra' in params: + query_params.append(('extra', params['extra'])) # noqa: E501 header_params = {} @@ -1075,6 +1091,8 @@ class DefaultApi(object): :param list[FileEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. @@ -1098,12 +1116,14 @@ class DefaultApi(object): :param list[FileEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. """ - all_params = ['entity_list', 'autoaccept', 'editgroup_id'] # noqa: E501 + all_params = ['entity_list', 'autoaccept', 'editgroup_id', 'description', 'extra'] # noqa: E501 all_params.append('async') all_params.append('_return_http_data_only') all_params.append('_preload_content') @@ -1132,6 +1152,10 @@ class DefaultApi(object): query_params.append(('autoaccept', params['autoaccept'])) # noqa: E501 if 'editgroup_id' in params: query_params.append(('editgroup_id', params['editgroup_id'])) # noqa: E501 + if 'description' in params: + query_params.append(('description', params['description'])) # noqa: E501 + if 'extra' in params: + query_params.append(('extra', params['extra'])) # noqa: E501 header_params = {} @@ -1285,6 +1309,8 @@ class DefaultApi(object): :param list[FilesetEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. 
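The batch endpoints above complement the one-entity-at-a-time flow exercised by the tests earlier in this patch: create an editgroup, attach edits to it via editgroup_id, then accept it. A condensed sketch of that flow, using only calls that appear in the tests; the wrapper function itself is hypothetical.

from fatcat_client import Editgroup, ReleaseEntity

def create_releases_in_editgroup(api, releases, description=None):
    # editor_id is derived from the auth token; the editgroup groups the edits
    eg = api.create_editgroup(Editgroup(description=description))
    edits = [api.create_release(r, editgroup_id=eg.editgroup_id) for r in releases]
    # nothing becomes 'active' until the editgroup is accepted
    api.accept_editgroup(eg.editgroup_id)
    return [api.get_release(e.ident) for e in edits]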
@@ -1308,12 +1334,14 @@ class DefaultApi(object): :param list[FilesetEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. """ - all_params = ['entity_list', 'autoaccept', 'editgroup_id'] # noqa: E501 + all_params = ['entity_list', 'autoaccept', 'editgroup_id', 'description', 'extra'] # noqa: E501 all_params.append('async') all_params.append('_return_http_data_only') all_params.append('_preload_content') @@ -1342,6 +1370,10 @@ class DefaultApi(object): query_params.append(('autoaccept', params['autoaccept'])) # noqa: E501 if 'editgroup_id' in params: query_params.append(('editgroup_id', params['editgroup_id'])) # noqa: E501 + if 'description' in params: + query_params.append(('description', params['description'])) # noqa: E501 + if 'extra' in params: + query_params.append(('extra', params['extra'])) # noqa: E501 header_params = {} @@ -1495,6 +1527,8 @@ class DefaultApi(object): :param list[ReleaseEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. @@ -1518,12 +1552,14 @@ class DefaultApi(object): :param list[ReleaseEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. """ - all_params = ['entity_list', 'autoaccept', 'editgroup_id'] # noqa: E501 + all_params = ['entity_list', 'autoaccept', 'editgroup_id', 'description', 'extra'] # noqa: E501 all_params.append('async') all_params.append('_return_http_data_only') all_params.append('_preload_content') @@ -1552,6 +1588,10 @@ class DefaultApi(object): query_params.append(('autoaccept', params['autoaccept'])) # noqa: E501 if 'editgroup_id' in params: query_params.append(('editgroup_id', params['editgroup_id'])) # noqa: E501 + if 'description' in params: + query_params.append(('description', params['description'])) # noqa: E501 + if 'extra' in params: + query_params.append(('extra', params['extra'])) # noqa: E501 header_params = {} @@ -1705,6 +1745,8 @@ class DefaultApi(object): :param list[WebcaptureEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. 
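The reworked importer tests in this patch verify their work by bracketing each run with changelog reads rather than trusting counters alone. A condensed sketch of that pattern, assuming a configured api client as in the fixtures:

def check_import_editgroup(api, last_index, substring):
    # the first changelog entry after 'last_index' should be the import's editgroup
    change = api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description and substring in eg.description.lower()
    assert eg.extra['git_rev']

# usage: record the index, run the importer, then check
#   last_index = api.get_changelog(limit=1)[0].index
#   ... run importer or batch create ...
#   check_import_editgroup(api, last_index, "crossref")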
@@ -1728,12 +1770,14 @@ class DefaultApi(object): :param list[WebcaptureEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. """ - all_params = ['entity_list', 'autoaccept', 'editgroup_id'] # noqa: E501 + all_params = ['entity_list', 'autoaccept', 'editgroup_id', 'description', 'extra'] # noqa: E501 all_params.append('async') all_params.append('_return_http_data_only') all_params.append('_preload_content') @@ -1762,6 +1806,10 @@ class DefaultApi(object): query_params.append(('autoaccept', params['autoaccept'])) # noqa: E501 if 'editgroup_id' in params: query_params.append(('editgroup_id', params['editgroup_id'])) # noqa: E501 + if 'description' in params: + query_params.append(('description', params['description'])) # noqa: E501 + if 'extra' in params: + query_params.append(('extra', params['extra'])) # noqa: E501 header_params = {} @@ -1915,6 +1963,8 @@ class DefaultApi(object): :param list[WorkEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. @@ -1938,12 +1988,14 @@ class DefaultApi(object): :param list[WorkEntity] entity_list: (required) :param bool autoaccept: If true, and editor is authorized, batch is accepted all at once :param str editgroup_id: Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True) + :param str description: + :param str extra: :return: list[EntityEdit] If the method is called asynchronously, returns the request thread. """ - all_params = ['entity_list', 'autoaccept', 'editgroup_id'] # noqa: E501 + all_params = ['entity_list', 'autoaccept', 'editgroup_id', 'description', 'extra'] # noqa: E501 all_params.append('async') all_params.append('_return_http_data_only') all_params.append('_preload_content') @@ -1972,6 +2024,10 @@ class DefaultApi(object): query_params.append(('autoaccept', params['autoaccept'])) # noqa: E501 if 'editgroup_id' in params: query_params.append(('editgroup_id', params['editgroup_id'])) # noqa: E501 + if 'description' in params: + query_params.append(('description', params['description'])) # noqa: E501 + if 'extra' in params: + query_params.append(('extra', params['extra'])) # noqa: E501 header_params = {} @@ -8836,6 +8892,8 @@ class DefaultApi(object): :param str pmid: :param str pmcid: :param str core_id: + :param str arxiv_id: + :param str jstor_id: :param str expand: List of sub-entities to expand in response. :param str hide: List of sub-entities to expand in response. For releases, 'files', 'filesets, 'webcaptures', 'container', and 'creators' are valid. :return: ReleaseEntity @@ -8864,6 +8922,8 @@ class DefaultApi(object): :param str pmid: :param str pmcid: :param str core_id: + :param str arxiv_id: + :param str jstor_id: :param str expand: List of sub-entities to expand in response. :param str hide: List of sub-entities to expand in response. For releases, 'files', 'filesets, 'webcaptures', 'container', and 'creators' are valid. 
:return: ReleaseEntity @@ -8871,7 +8931,7 @@ class DefaultApi(object): returns the request thread. """ - all_params = ['doi', 'wikidata_qid', 'isbn13', 'pmid', 'pmcid', 'core_id', 'expand', 'hide'] # noqa: E501 + all_params = ['doi', 'wikidata_qid', 'isbn13', 'pmid', 'pmcid', 'core_id', 'arxiv_id', 'jstor_id', 'expand', 'hide'] # noqa: E501 all_params.append('async') all_params.append('_return_http_data_only') all_params.append('_preload_content') @@ -8904,6 +8964,10 @@ class DefaultApi(object): query_params.append(('pmcid', params['pmcid'])) # noqa: E501 if 'core_id' in params: query_params.append(('core_id', params['core_id'])) # noqa: E501 + if 'arxiv_id' in params: + query_params.append(('arxiv_id', params['arxiv_id'])) # noqa: E501 + if 'jstor_id' in params: + query_params.append(('jstor_id', params['jstor_id'])) # noqa: E501 if 'expand' in params: query_params.append(('expand', params['expand'])) # noqa: E501 if 'hide' in params: diff --git a/python_client/fatcat_client/models/container_entity.py b/python_client/fatcat_client/models/container_entity.py index 5ed7eb0a..f5a054c5 100644 --- a/python_client/fatcat_client/models/container_entity.py +++ b/python_client/fatcat_client/models/container_entity.py @@ -31,11 +31,10 @@ class ContainerEntity(object): and the value is json key in definition. """ swagger_types = { - 'coden': 'str', - 'abbrev': 'str', 'wikidata_qid': 'str', 'issnl': 'str', 'publisher': 'str', + 'container_type': 'str', 'name': 'str', 'edit_extra': 'object', 'extra': 'object', @@ -46,11 +45,10 @@ class ContainerEntity(object): } attribute_map = { - 'coden': 'coden', - 'abbrev': 'abbrev', 'wikidata_qid': 'wikidata_qid', 'issnl': 'issnl', 'publisher': 'publisher', + 'container_type': 'container_type', 'name': 'name', 'edit_extra': 'edit_extra', 'extra': 'extra', @@ -60,14 +58,13 @@ class ContainerEntity(object): 'state': 'state' } - def __init__(self, coden=None, abbrev=None, wikidata_qid=None, issnl=None, publisher=None, name=None, edit_extra=None, extra=None, redirect=None, revision=None, ident=None, state=None): # noqa: E501 + def __init__(self, wikidata_qid=None, issnl=None, publisher=None, container_type=None, name=None, edit_extra=None, extra=None, redirect=None, revision=None, ident=None, state=None): # noqa: E501 """ContainerEntity - a model defined in Swagger""" # noqa: E501 - self._coden = None - self._abbrev = None self._wikidata_qid = None self._issnl = None self._publisher = None + self._container_type = None self._name = None self._edit_extra = None self._extra = None @@ -77,16 +74,14 @@ class ContainerEntity(object): self._state = None self.discriminator = None - if coden is not None: - self.coden = coden - if abbrev is not None: - self.abbrev = abbrev if wikidata_qid is not None: self.wikidata_qid = wikidata_qid if issnl is not None: self.issnl = issnl if publisher is not None: self.publisher = publisher + if container_type is not None: + self.container_type = container_type if name is not None: self.name = name if edit_extra is not None: @@ -103,48 +98,6 @@ class ContainerEntity(object): self.state = state @property - def coden(self): - """Gets the coden of this ContainerEntity. # noqa: E501 - - - :return: The coden of this ContainerEntity. # noqa: E501 - :rtype: str - """ - return self._coden - - @coden.setter - def coden(self, coden): - """Sets the coden of this ContainerEntity. - - - :param coden: The coden of this ContainerEntity. 
# noqa: E501 - :type: str - """ - - self._coden = coden - - @property - def abbrev(self): - """Gets the abbrev of this ContainerEntity. # noqa: E501 - - - :return: The abbrev of this ContainerEntity. # noqa: E501 - :rtype: str - """ - return self._abbrev - - @abbrev.setter - def abbrev(self, abbrev): - """Sets the abbrev of this ContainerEntity. - - - :param abbrev: The abbrev of this ContainerEntity. # noqa: E501 - :type: str - """ - - self._abbrev = abbrev - - @property def wikidata_qid(self): """Gets the wikidata_qid of this ContainerEntity. # noqa: E501 @@ -214,6 +167,29 @@ class ContainerEntity(object): self._publisher = publisher @property + def container_type(self): + """Gets the container_type of this ContainerEntity. # noqa: E501 + + Eg, 'journal' # noqa: E501 + + :return: The container_type of this ContainerEntity. # noqa: E501 + :rtype: str + """ + return self._container_type + + @container_type.setter + def container_type(self, container_type): + """Sets the container_type of this ContainerEntity. + + Eg, 'journal' # noqa: E501 + + :param container_type: The container_type of this ContainerEntity. # noqa: E501 + :type: str + """ + + self._container_type = container_type + + @property def name(self): """Gets the name of this ContainerEntity. # noqa: E501 diff --git a/python_client/fatcat_client/models/editgroup.py b/python_client/fatcat_client/models/editgroup.py index 8339f2e3..2d0768a1 100644 --- a/python_client/fatcat_client/models/editgroup.py +++ b/python_client/fatcat_client/models/editgroup.py @@ -38,6 +38,7 @@ class Editgroup(object): 'editgroup_id': 'str', 'editor_id': 'str', 'editor': 'Editor', + 'changelog_index': 'int', 'submitted': 'datetime', 'description': 'str', 'extra': 'object', @@ -49,6 +50,7 @@ class Editgroup(object): 'editgroup_id': 'editgroup_id', 'editor_id': 'editor_id', 'editor': 'editor', + 'changelog_index': 'changelog_index', 'submitted': 'submitted', 'description': 'description', 'extra': 'extra', @@ -56,12 +58,13 @@ class Editgroup(object): 'edits': 'edits' } - def __init__(self, editgroup_id=None, editor_id=None, editor=None, submitted=None, description=None, extra=None, annotations=None, edits=None): # noqa: E501 + def __init__(self, editgroup_id=None, editor_id=None, editor=None, changelog_index=None, submitted=None, description=None, extra=None, annotations=None, edits=None): # noqa: E501 """Editgroup - a model defined in Swagger""" # noqa: E501 self._editgroup_id = None self._editor_id = None self._editor = None + self._changelog_index = None self._submitted = None self._description = None self._extra = None @@ -75,6 +78,8 @@ class Editgroup(object): self.editor_id = editor_id if editor is not None: self.editor = editor + if changelog_index is not None: + self.changelog_index = changelog_index if submitted is not None: self.submitted = submitted if description is not None: @@ -166,6 +171,27 @@ class Editgroup(object): self._editor = editor @property + def changelog_index(self): + """Gets the changelog_index of this Editgroup. # noqa: E501 + + + :return: The changelog_index of this Editgroup. # noqa: E501 + :rtype: int + """ + return self._changelog_index + + @changelog_index.setter + def changelog_index(self, changelog_index): + """Sets the changelog_index of this Editgroup. + + + :param changelog_index: The changelog_index of this Editgroup. # noqa: E501 + :type: int + """ + + self._changelog_index = changelog_index + + @property def submitted(self): """Gets the submitted of this Editgroup. 
# noqa: E501 diff --git a/python_client/fatcat_client/models/release_contrib.py b/python_client/fatcat_client/models/release_contrib.py index a06b0e66..e823e43e 100644 --- a/python_client/fatcat_client/models/release_contrib.py +++ b/python_client/fatcat_client/models/release_contrib.py @@ -37,8 +37,9 @@ class ReleaseContrib(object): 'creator_id': 'str', 'creator': 'CreatorEntity', 'raw_name': 'str', - 'extra': 'object', - 'role': 'str' + 'role': 'str', + 'raw_affiliation': 'str', + 'extra': 'object' } attribute_map = { @@ -46,19 +47,21 @@ class ReleaseContrib(object): 'creator_id': 'creator_id', 'creator': 'creator', 'raw_name': 'raw_name', - 'extra': 'extra', - 'role': 'role' + 'role': 'role', + 'raw_affiliation': 'raw_affiliation', + 'extra': 'extra' } - def __init__(self, index=None, creator_id=None, creator=None, raw_name=None, extra=None, role=None): # noqa: E501 + def __init__(self, index=None, creator_id=None, creator=None, raw_name=None, role=None, raw_affiliation=None, extra=None): # noqa: E501 """ReleaseContrib - a model defined in Swagger""" # noqa: E501 self._index = None self._creator_id = None self._creator = None self._raw_name = None - self._extra = None self._role = None + self._raw_affiliation = None + self._extra = None self.discriminator = None if index is not None: @@ -69,10 +72,12 @@ class ReleaseContrib(object): self.creator = creator if raw_name is not None: self.raw_name = raw_name - if extra is not None: - self.extra = extra if role is not None: self.role = role + if raw_affiliation is not None: + self.raw_affiliation = raw_affiliation + if extra is not None: + self.extra = extra @property def index(self): @@ -161,46 +166,69 @@ class ReleaseContrib(object): self._raw_name = raw_name @property - def extra(self): - """Gets the extra of this ReleaseContrib. # noqa: E501 + def role(self): + """Gets the role of this ReleaseContrib. # noqa: E501 - :return: The extra of this ReleaseContrib. # noqa: E501 - :rtype: object + :return: The role of this ReleaseContrib. # noqa: E501 + :rtype: str """ - return self._extra + return self._role - @extra.setter - def extra(self, extra): - """Sets the extra of this ReleaseContrib. + @role.setter + def role(self, role): + """Sets the role of this ReleaseContrib. - :param extra: The extra of this ReleaseContrib. # noqa: E501 - :type: object + :param role: The role of this ReleaseContrib. # noqa: E501 + :type: str """ - self._extra = extra + self._role = role @property - def role(self): - """Gets the role of this ReleaseContrib. # noqa: E501 + def raw_affiliation(self): + """Gets the raw_affiliation of this ReleaseContrib. # noqa: E501 + Raw affiliation string as displayed in text # noqa: E501 - :return: The role of this ReleaseContrib. # noqa: E501 + :return: The raw_affiliation of this ReleaseContrib. # noqa: E501 :rtype: str """ - return self._role + return self._raw_affiliation - @role.setter - def role(self, role): - """Sets the role of this ReleaseContrib. + @raw_affiliation.setter + def raw_affiliation(self, raw_affiliation): + """Sets the raw_affiliation of this ReleaseContrib. + Raw affiliation string as displayed in text # noqa: E501 - :param role: The role of this ReleaseContrib. # noqa: E501 + :param raw_affiliation: The raw_affiliation of this ReleaseContrib. # noqa: E501 :type: str """ - self._role = role + self._raw_affiliation = raw_affiliation + + @property + def extra(self): + """Gets the extra of this ReleaseContrib. # noqa: E501 + + + :return: The extra of this ReleaseContrib. 
# noqa: E501 + :rtype: object + """ + return self._extra + + @extra.setter + def extra(self, extra): + """Sets the extra of this ReleaseContrib. + + + :param extra: The extra of this ReleaseContrib. # noqa: E501 + :type: object + """ + + self._extra = extra def to_dict(self): """Returns the model properties as a dict""" diff --git a/python_client/fatcat_client/models/release_entity.py b/python_client/fatcat_client/models/release_entity.py index 83648351..5df97e50 100644 --- a/python_client/fatcat_client/models/release_entity.py +++ b/python_client/fatcat_client/models/release_entity.py @@ -42,11 +42,14 @@ class ReleaseEntity(object): 'abstracts': 'list[ReleaseEntityAbstracts]', 'refs': 'list[ReleaseRef]', 'contribs': 'list[ReleaseContrib]', + 'license_slug': 'str', 'language': 'str', 'publisher': 'str', 'pages': 'str', 'issue': 'str', 'volume': 'str', + 'jstor_id': 'str', + 'arxiv_id': 'str', 'core_id': 'str', 'pmcid': 'str', 'pmid': 'str', @@ -63,6 +66,7 @@ class ReleaseEntity(object): 'files': 'list[FileEntity]', 'container': 'ContainerEntity', 'work_id': 'str', + 'original_title': 'str', 'title': 'str', 'state': 'str', 'ident': 'str', @@ -76,11 +80,14 @@ class ReleaseEntity(object): 'abstracts': 'abstracts', 'refs': 'refs', 'contribs': 'contribs', + 'license_slug': 'license_slug', 'language': 'language', 'publisher': 'publisher', 'pages': 'pages', 'issue': 'issue', 'volume': 'volume', + 'jstor_id': 'jstor_id', + 'arxiv_id': 'arxiv_id', 'core_id': 'core_id', 'pmcid': 'pmcid', 'pmid': 'pmid', @@ -97,6 +104,7 @@ class ReleaseEntity(object): 'files': 'files', 'container': 'container', 'work_id': 'work_id', + 'original_title': 'original_title', 'title': 'title', 'state': 'state', 'ident': 'ident', @@ -106,17 +114,20 @@ class ReleaseEntity(object): 'edit_extra': 'edit_extra' } - def __init__(self, abstracts=None, refs=None, contribs=None, language=None, publisher=None, pages=None, issue=None, volume=None, core_id=None, pmcid=None, pmid=None, isbn13=None, wikidata_qid=None, doi=None, release_year=None, release_date=None, release_status=None, release_type=None, container_id=None, webcaptures=None, filesets=None, files=None, container=None, work_id=None, title=None, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None): # noqa: E501 + def __init__(self, abstracts=None, refs=None, contribs=None, license_slug=None, language=None, publisher=None, pages=None, issue=None, volume=None, jstor_id=None, arxiv_id=None, core_id=None, pmcid=None, pmid=None, isbn13=None, wikidata_qid=None, doi=None, release_year=None, release_date=None, release_status=None, release_type=None, container_id=None, webcaptures=None, filesets=None, files=None, container=None, work_id=None, original_title=None, title=None, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None): # noqa: E501 """ReleaseEntity - a model defined in Swagger""" # noqa: E501 self._abstracts = None self._refs = None self._contribs = None + self._license_slug = None self._language = None self._publisher = None self._pages = None self._issue = None self._volume = None + self._jstor_id = None + self._arxiv_id = None self._core_id = None self._pmcid = None self._pmid = None @@ -133,6 +144,7 @@ class ReleaseEntity(object): self._files = None self._container = None self._work_id = None + self._original_title = None self._title = None self._state = None self._ident = None @@ -148,6 +160,8 @@ class ReleaseEntity(object): self.refs = refs if contribs is not None: self.contribs = contribs + if license_slug is 
not None: + self.license_slug = license_slug if language is not None: self.language = language if publisher is not None: @@ -158,6 +172,10 @@ class ReleaseEntity(object): self.issue = issue if volume is not None: self.volume = volume + if jstor_id is not None: + self.jstor_id = jstor_id + if arxiv_id is not None: + self.arxiv_id = arxiv_id if core_id is not None: self.core_id = core_id if pmcid is not None: @@ -190,6 +208,8 @@ class ReleaseEntity(object): self.container = container if work_id is not None: self.work_id = work_id + if original_title is not None: + self.original_title = original_title if title is not None: self.title = title if state is not None: @@ -269,6 +289,29 @@ class ReleaseEntity(object): self._contribs = contribs @property + def license_slug(self): + """Gets the license_slug of this ReleaseEntity. # noqa: E501 + + Short version of license name. Eg, 'CC-BY' # noqa: E501 + + :return: The license_slug of this ReleaseEntity. # noqa: E501 + :rtype: str + """ + return self._license_slug + + @license_slug.setter + def license_slug(self, license_slug): + """Sets the license_slug of this ReleaseEntity. + + Short version of license name. Eg, 'CC-BY' # noqa: E501 + + :param license_slug: The license_slug of this ReleaseEntity. # noqa: E501 + :type: str + """ + + self._license_slug = license_slug + + @property def language(self): """Gets the language of this ReleaseEntity. # noqa: E501 @@ -376,6 +419,48 @@ class ReleaseEntity(object): self._volume = volume @property + def jstor_id(self): + """Gets the jstor_id of this ReleaseEntity. # noqa: E501 + + + :return: The jstor_id of this ReleaseEntity. # noqa: E501 + :rtype: str + """ + return self._jstor_id + + @jstor_id.setter + def jstor_id(self, jstor_id): + """Sets the jstor_id of this ReleaseEntity. + + + :param jstor_id: The jstor_id of this ReleaseEntity. # noqa: E501 + :type: str + """ + + self._jstor_id = jstor_id + + @property + def arxiv_id(self): + """Gets the arxiv_id of this ReleaseEntity. # noqa: E501 + + + :return: The arxiv_id of this ReleaseEntity. # noqa: E501 + :rtype: str + """ + return self._arxiv_id + + @arxiv_id.setter + def arxiv_id(self, arxiv_id): + """Sets the arxiv_id of this ReleaseEntity. + + + :param arxiv_id: The arxiv_id of this ReleaseEntity. # noqa: E501 + :type: str + """ + + self._arxiv_id = arxiv_id + + @property def core_id(self): """Gets the core_id of this ReleaseEntity. # noqa: E501 @@ -720,10 +805,33 @@ class ReleaseEntity(object): self._work_id = work_id @property + def original_title(self): + """Gets the original_title of this ReleaseEntity. # noqa: E501 + + Title in original language (or, the language of the full text of this release) # noqa: E501 + + :return: The original_title of this ReleaseEntity. # noqa: E501 + :rtype: str + """ + return self._original_title + + @original_title.setter + def original_title(self, original_title): + """Sets the original_title of this ReleaseEntity. + + Title in original language (or, the language of the full text of this release) # noqa: E501 + + :param original_title: The original_title of this ReleaseEntity. # noqa: E501 + :type: str + """ + + self._original_title = original_title + + @property def title(self): """Gets the title of this ReleaseEntity. # noqa: E501 - Required for valid entities # noqa: E501 + Required for valid entities. The title used in citations and for display; usually English # noqa: E501 :return: The title of this ReleaseEntity. 
# noqa: E501 :rtype: str @@ -734,7 +842,7 @@ class ReleaseEntity(object): def title(self, title): """Sets the title of this ReleaseEntity. - Required for valid entities # noqa: E501 + Required for valid entities. The title used in citations and for display; usually English # noqa: E501 :param title: The title of this ReleaseEntity. # noqa: E501 :type: str diff --git a/python_client/fatcat_client/models/webcapture_entity.py b/python_client/fatcat_client/models/webcapture_entity.py index 9d49c916..6db49186 100644 --- a/python_client/fatcat_client/models/webcapture_entity.py +++ b/python_client/fatcat_client/models/webcapture_entity.py @@ -125,6 +125,7 @@ class WebcaptureEntity(object): def timestamp(self): """Gets the timestamp of this WebcaptureEntity. # noqa: E501 + same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Can be the earliest or average of CDX timestamps if that makes sense. # noqa: E501 :return: The timestamp of this WebcaptureEntity. # noqa: E501 :rtype: datetime @@ -135,6 +136,7 @@ class WebcaptureEntity(object): def timestamp(self, timestamp): """Sets the timestamp of this WebcaptureEntity. + same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Can be the earliest or average of CDX timestamps if that makes sense. # noqa: E501 :param timestamp: The timestamp of this WebcaptureEntity. # noqa: E501 :type: datetime diff --git a/python_client/fatcat_client/models/webcapture_entity_cdx.py b/python_client/fatcat_client/models/webcapture_entity_cdx.py index a34ea2f1..082c1b96 100644 --- a/python_client/fatcat_client/models/webcapture_entity_cdx.py +++ b/python_client/fatcat_client/models/webcapture_entity_cdx.py @@ -32,7 +32,7 @@ class WebcaptureEntityCdx(object): """ swagger_types = { 'surt': 'str', - 'timestamp': 'str', + 'timestamp': 'datetime', 'url': 'str', 'mimetype': 'str', 'status_code': 'int', @@ -100,9 +100,10 @@ class WebcaptureEntityCdx(object): def timestamp(self): """Gets the timestamp of this WebcaptureEntityCdx. # noqa: E501 + UTC, 'Z'-terminated, second (or better) precision # noqa: E501 :return: The timestamp of this WebcaptureEntityCdx. # noqa: E501 - :rtype: str + :rtype: datetime """ return self._timestamp @@ -110,9 +111,10 @@ class WebcaptureEntityCdx(object): def timestamp(self, timestamp): """Sets the timestamp of this WebcaptureEntityCdx. + UTC, 'Z'-terminated, second (or better) precision # noqa: E501 :param timestamp: The timestamp of this WebcaptureEntityCdx. 
# noqa: E501 - :type: str + :type: datetime """ if timestamp is None: raise ValueError("Invalid value for `timestamp`, must not be `None`") # noqa: E501 diff --git a/rust/Cargo.lock b/rust/Cargo.lock index e89954ad..c0df5a2a 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -538,6 +538,9 @@ dependencies = [ "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "sentry 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_ignored 0.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)", "sha1 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", "slog 2.4.1 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 155e3c8a..c5a52845 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -37,6 +37,9 @@ slog = "^2.0" slog-term = "*" slog-async = "*" serde_json = "1.0" +serde = "*" +serde_derive = "1.0" +serde_ignored = "0.0.4" sentry = { version = "^0.12", default-features = false, features = ["with_client_implementation", "with_backtrace", "with_panic", "with_log", "with_rust_info", "with_failure"] } cadence = "^0.16" diff --git a/rust/README.export.md b/rust/README.export.md index cee361c0..97c2c028 100644 --- a/rust/README.export.md +++ b/rust/README.export.md @@ -9,5 +9,5 @@ Then dump: Or, perhaps, in production: - cat /tmp/fatcat_ident_releases.tsv | ./target/release/fatcat-export release --expand files,container -j8 | pv -l | gzip > release_export_expanded.json.gz + cat /tmp/fatcat_ident_releases.tsv | ./target/release/fatcat-export release --expand files,filesets,webcaptures,container -j8 | pv -l | gzip > release_export_expanded.json.gz diff --git a/rust/codegen_openapi2.sh b/rust/codegen_openapi2.sh index f7d1df53..a3950c58 100755 --- a/rust/codegen_openapi2.sh +++ b/rust/codegen_openapi2.sh @@ -11,4 +11,16 @@ sed -i 's/extern crate uuid;/extern crate serde_json;\nextern crate uuid;/g' fat # Hack to fix "release_date" as Date, not DateTime sed -i 's/release_date: Option<chrono::DateTime<chrono::Utc>>/release_date: Option<chrono::NaiveDate>/g' fatcat-api-spec/src/models.rs +# Hack to require that optional params parse correctly (boolean, integer, datetime) +# If we reformat, this this should basically go from, eg: +# .and_then(|x| x.parse::<i64>() +# .ok()); +# To: +# .and_then(|x| Some(x.parse::<i64>())) +# .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) +# .map_err(|x| Response::with((status::InternalServerError, "unparsable query parameter (expected integer)".to_string())))?; +sed -i 's/.and_then(|x| x.parse::<i64>().ok());$/.and_then(|x| Some(x.parse::<i64>())).map_or_else(|| Ok(None), |x| x.map(|v| Some(v))).map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?;/g' fatcat-api-spec/src/server.rs +sed -i 's/.and_then(|x| x.parse::<bool>().ok());$/.and_then(|x| Some(x.to_lowercase().parse::<bool>())).map_or_else(|| Ok(None), |x| x.map(|v| Some(v))).map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected boolean)".to_string())))?;/g' fatcat-api-spec/src/server.rs +sed -i 's/.and_then(|x| x.parse::<chrono::DateTime<chrono::Utc>>().ok());$/.and_then(|x| 
Some(x.parse::<chrono::DateTime<chrono::Utc>>())).map_or_else(|| Ok(None), |x| x.map(|v| Some(v))).map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected UTC datetime in ISO\/RFC format)".to_string())))?;/g' fatcat-api-spec/src/server.rs + cargo fmt diff --git a/rust/fatcat-api-spec/README.md b/rust/fatcat-api-spec/README.md index ee031f66..cacded6a 100644 --- a/rust/fatcat-api-spec/README.md +++ b/rust/fatcat-api-spec/README.md @@ -13,7 +13,7 @@ To see how to make this your own, look here: [README](https://github.com/swagger-api/swagger-codegen/blob/master/README.md) - API version: 0.1.0 -- Build date: 2019-01-11T23:46:50.303Z +- Build date: 2019-01-23T05:30:23.378Z This autogenerated project defines an API crate `fatcat` which contains: * An `Api` trait defining the API in Rust. diff --git a/rust/fatcat-api-spec/api.yaml b/rust/fatcat-api-spec/api.yaml index 70a27b76..14b70c39 100644 --- a/rust/fatcat-api-spec/api.yaml +++ b/rust/fatcat-api-spec/api.yaml @@ -26,6 +26,10 @@ tags: # TAGLINE descriptions: "Creator entities: such as authors" # TAGLINE - name: files # TAGLINE descriptions: "File entities" # TAGLINE + - name: filesets # TAGLINE + descriptions: "Fileset entities" # TAGLINE + - name: webcaptures # TAGLINE + descriptions: "Webcapture entities" # TAGLINE - name: releases # TAGLINE descriptions: "Release entities: individual articles, pre-prints, books" # TAGLINE - name: works # TAGLINE @@ -132,6 +136,9 @@ definitions: type: string example: "Journal of Important Results" description: "Required for valid entities" + container_type: + type: string + description: "Eg, 'journal'" publisher: type: string example: "Society of Curious Students" @@ -139,10 +146,6 @@ definitions: <<: *FATCATISSN wikidata_qid: type: string - abbrev: - type: string - coden: - type: string creator_entity: type: object # required for creation: display_name @@ -249,7 +252,7 @@ definitions: properties: <<: *ENTITYPROPS cdx: - # limit of 200 CDX lines, at least to start + # limit of 200 CDX lines, at least to start? type: array items: type: object @@ -264,7 +267,9 @@ definitions: example: "org,asheesh)/apus/ch1/node15.html" timestamp: type: string - example: "20020429162520" + format: date-time + example: "2016-09-19T17:20:24Z" + description: "UTC, 'Z'-terminated, second (or better) precision" url: type: string # NOTE: not format:url to allow alternatives @@ -302,6 +307,7 @@ definitions: timestamp: type: string format: date-time + description: "same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Can be the earliest or average of CDX timestamps if that makes sense." release_ids: type: array items: @@ -313,7 +319,10 @@ definitions: <<: *ENTITYPROPS title: type: string - description: "Required for valid entities" + description: "Required for valid entities. 
The title used in citations and for display; usually English" + original_title: + type: string + description: "Title in original language (or, the language of the full text of this release)" work_id: type: string example: "q3nouwy3nnbsvo3h5klxsx4a7y" @@ -343,7 +352,7 @@ definitions: example: "book" release_status: type: string - example: "preprint" + example: "preprint, retracted" release_date: type: string format: date @@ -367,6 +376,10 @@ definitions: core_id: type: string #format: custom + arxiv_id: + type: string + jstor_id: + type: string volume: type: string issue: @@ -379,6 +392,9 @@ definitions: language: description: "Two-letter RFC1766/ISO639-1 language code, with extensions" type: string + license_slug: + type: string + description: "Short version of license name. Eg, 'CC-BY'" contribs: type: array items: @@ -588,11 +604,14 @@ definitions: description: "Optional; GET-only" raw_name: type: string + role: + type: string + raw_affiliation: + type: string + description: "Raw affiliation string as displayed in text" extra: type: object additionalProperties: {} - role: - type: string auth_oidc: type: object required: @@ -687,6 +706,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -945,6 +972,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -1226,6 +1261,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -1489,6 +1532,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -1718,6 +1769,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -1947,6 +2006,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true @@ -2201,6 +2268,14 @@ paths: in: query type: string required: false + - name: arxiv_id + in: query + type: string + required: false + - name: jstor_id + in: query + type: string + required: false - name: expand in: query type: string @@ -2291,6 +2366,14 @@ paths: type: string required: false description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" + - name: description + in: 
query + type: string + required: false + - name: extra + in: query + type: string + required: false - name: entity_list in: body required: true diff --git a/rust/fatcat-api-spec/api/swagger.yaml b/rust/fatcat-api-spec/api/swagger.yaml index 7a1c2755..927bb941 100644 --- a/rust/fatcat-api-spec/api/swagger.yaml +++ b/rust/fatcat-api-spec/api/swagger.yaml @@ -11,6 +11,8 @@ tags: - name: "containers" - name: "creators" - name: "files" +- name: "filesets" +- name: "webcaptures" - name: "releases" - name: "works" - name: "edit-lifecycle" @@ -133,6 +135,18 @@ paths: type: "string" formatString: "{:?}" example: "Some(\"editgroup_id_example\".to_string())" + - name: "description" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"description_example\".to_string())" + - name: "extra" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"extra_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -979,6 +993,18 @@ paths: type: "string" formatString: "{:?}" example: "Some(\"editgroup_id_example\".to_string())" + - name: "description" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"description_example\".to_string())" + - name: "extra" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"extra_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -1889,6 +1915,18 @@ paths: type: "string" formatString: "{:?}" example: "Some(\"editgroup_id_example\".to_string())" + - name: "description" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"description_example\".to_string())" + - name: "extra" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"extra_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -2748,6 +2786,18 @@ paths: type: "string" formatString: "{:?}" example: "Some(\"editgroup_id_example\".to_string())" + - name: "description" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"description_example\".to_string())" + - name: "extra" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"extra_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -3517,6 +3567,18 @@ paths: type: "string" formatString: "{:?}" example: "Some(\"editgroup_id_example\".to_string())" + - name: "description" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"description_example\".to_string())" + - name: "extra" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"extra_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -4286,6 +4348,18 @@ paths: type: "string" formatString: "{:?}" example: "Some(\"editgroup_id_example\".to_string())" + - name: "description" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"description_example\".to_string())" + - name: "extra" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"extra_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -5039,6 +5113,18 @@ paths: type: "string" formatString: "{:?}" example: "Some(\"core_id_example\".to_string())" + - name: "arxiv_id" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"arxiv_id_example\".to_string())" + - name: 
"jstor_id" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"jstor_id_example\".to_string())" - name: "expand" in: "query" description: "List of sub-entities to expand in response." @@ -5346,6 +5432,18 @@ paths: type: "string" formatString: "{:?}" example: "Some(\"editgroup_id_example\".to_string())" + - name: "description" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"description_example\".to_string())" + - name: "extra" + in: "query" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"extra_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -7273,10 +7371,6 @@ definitions: container_entity: type: "object" properties: - coden: - type: "string" - abbrev: - type: "string" wikidata_qid: type: "string" issnl: @@ -7288,6 +7382,9 @@ definitions: publisher: type: "string" example: "Society of Curious Students" + container_type: + type: "string" + description: "Eg, 'journal'" name: type: "string" example: "Journal of Important Results" @@ -7326,13 +7423,12 @@ definitions: - "deleted" example: redirect: "q3nouwy3nnbsvo3h5klxsx4a7y" - coden: "coden" ident: "q3nouwy3nnbsvo3h5klxsx4a7y" extra: "{}" + container_type: "container_type" name: "Journal of Important Results" publisher: "Society of Curious Students" issnl: "1234-5678" - abbrev: "abbrev" wikidata_qid: "wikidata_qid" state: "wip" edit_extra: "{}" @@ -7592,6 +7688,9 @@ definitions: timestamp: type: "string" format: "date-time" + description: "same format as CDX line timestamp (UTC, etc). Corresponds to\ + \ the overall capture timestamp. Can be the earliest or average of CDX timestamps\ + \ if that makes sense." original_url: type: "string" format: "url" @@ -7651,14 +7750,14 @@ definitions: sha256: "cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452" mimetype: "text/html" url: "http://www.asheesh.org:80/APUS/ch1/node15.html" - timestamp: "20020429162520" + timestamp: "2016-09-19T17:20:24Z" - sha1: "e9dd75237c94b209dc3ccd52722de6931a310ba3" surt: "org,asheesh)/apus/ch1/node15.html" status_code: 200 sha256: "cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452" mimetype: "text/html" url: "http://www.asheesh.org:80/APUS/ch1/node15.html" - timestamp: "20020429162520" + timestamp: "2016-09-19T17:20:24Z" ident: "q3nouwy3nnbsvo3h5klxsx4a7y" extra: "{}" state: "wip" @@ -7684,6 +7783,9 @@ definitions: type: "array" items: $ref: "#/definitions/release_contrib" + license_slug: + type: "string" + description: "Short version of license name. Eg, 'CC-BY'" language: type: "string" description: "Two-letter RFC1766/ISO639-1 language code, with extensions" @@ -7696,6 +7798,10 @@ definitions: example: "12" volume: type: "string" + jstor_id: + type: "string" + arxiv_id: + type: "string" core_id: type: "string" pmcid: @@ -7718,7 +7824,7 @@ definitions: format: "date" release_status: type: "string" - example: "preprint" + example: "preprint, retracted" release_type: type: "string" example: "book" @@ -7746,9 +7852,14 @@ definitions: work_id: type: "string" example: "q3nouwy3nnbsvo3h5klxsx4a7y" + original_title: + type: "string" + description: "Title in original language (or, the language of the full text\ + \ of this release)" title: type: "string" - description: "Required for valid entities" + description: "Required for valid entities. 
The title used in citations and\ + \ for display; usually English" state: type: "string" enum: @@ -7784,13 +7895,12 @@ definitions: example: container: redirect: "q3nouwy3nnbsvo3h5klxsx4a7y" - coden: "coden" ident: "q3nouwy3nnbsvo3h5klxsx4a7y" extra: "{}" + container_type: "container_type" name: "Journal of Important Results" publisher: "Society of Curious Students" issnl: "1234-5678" - abbrev: "abbrev" wikidata_qid: "wikidata_qid" state: "wip" edit_extra: "{}" @@ -7810,14 +7920,14 @@ definitions: sha256: "cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452" mimetype: "text/html" url: "http://www.asheesh.org:80/APUS/ch1/node15.html" - timestamp: "20020429162520" + timestamp: "2016-09-19T17:20:24Z" - sha1: "e9dd75237c94b209dc3ccd52722de6931a310ba3" surt: "org,asheesh)/apus/ch1/node15.html" status_code: 200 sha256: "cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452" mimetype: "text/html" url: "http://www.asheesh.org:80/APUS/ch1/node15.html" - timestamp: "20020429162520" + timestamp: "2016-09-19T17:20:24Z" ident: "q3nouwy3nnbsvo3h5klxsx4a7y" extra: "{}" state: "wip" @@ -7841,14 +7951,14 @@ definitions: sha256: "cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452" mimetype: "text/html" url: "http://www.asheesh.org:80/APUS/ch1/node15.html" - timestamp: "20020429162520" + timestamp: "2016-09-19T17:20:24Z" - sha1: "e9dd75237c94b209dc3ccd52722de6931a310ba3" surt: "org,asheesh)/apus/ch1/node15.html" status_code: 200 sha256: "cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452" mimetype: "text/html" url: "http://www.asheesh.org:80/APUS/ch1/node15.html" - timestamp: "20020429162520" + timestamp: "2016-09-19T17:20:24Z" ident: "q3nouwy3nnbsvo3h5klxsx4a7y" extra: "{}" state: "wip" @@ -7862,7 +7972,8 @@ definitions: language: "language" title: "title" contribs: - - creator: + - raw_affiliation: "raw_affiliation" + creator: redirect: "q3nouwy3nnbsvo3h5klxsx4a7y" surname: "surname" ident: "q3nouwy3nnbsvo3h5klxsx4a7y" @@ -7879,7 +7990,8 @@ definitions: extra: "{}" creator_id: "creator_id" index: 1 - - creator: + - raw_affiliation: "raw_affiliation" + creator: redirect: "q3nouwy3nnbsvo3h5klxsx4a7y" surname: "surname" ident: "q3nouwy3nnbsvo3h5klxsx4a7y" @@ -7901,9 +8013,11 @@ definitions: extra: "{}" state: "wip" edit_extra: "{}" + jstor_id: "jstor_id" redirect: "q3nouwy3nnbsvo3h5klxsx4a7y" work_id: "q3nouwy3nnbsvo3h5klxsx4a7y" issue: "12" + original_title: "original_title" abstracts: - sha1: "e9dd75237c94b209dc3ccd52722de6931a310ba3" mimetype: "application/xml+jats" @@ -7917,9 +8031,10 @@ definitions: release_type: "book" wikidata_qid: "wikidata_qid" pmid: "pmid" - release_status: "preprint" + release_status: "preprint, retracted" revision: "86daea5b-1b6b-432a-bb67-ea97795f80fe" volume: "volume" + license_slug: "license_slug" refs: - target_release_id: "q3nouwy3nnbsvo3h5klxsx4a7y" container_name: "container_name" @@ -7979,6 +8094,7 @@ definitions: - "q3nouwy3nnbsvo3h5klxsx4a7y" edit_extra: "{}" md5: "1b39813549077b2347c0f370c3864b40" + arxiv_id: "arxiv_id" filesets: - redirect: "q3nouwy3nnbsvo3h5klxsx4a7y" urls: @@ -8929,11 +9045,15 @@ definitions: $ref: "#/definitions/creator_entity" raw_name: type: "string" - extra: - type: "object" role: type: "string" + raw_affiliation: + type: "string" + description: "Raw affiliation string as displayed in text" + extra: + type: "object" example: + raw_affiliation: "raw_affiliation" creator: redirect: "q3nouwy3nnbsvo3h5klxsx4a7y" surname: "surname" @@ -9072,7 +9192,9 @@ definitions: example: 
"org,asheesh)/apus/ch1/node15.html" timestamp: type: "string" - example: "20020429162520" + format: "date-time" + example: "2016-09-19T17:20:24Z" + description: "UTC, 'Z'-terminated, second (or better) precision" url: type: "string" example: "http://www.asheesh.org:80/APUS/ch1/node15.html" @@ -9102,7 +9224,7 @@ definitions: sha256: "cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452" mimetype: "text/html" url: "http://www.asheesh.org:80/APUS/ch1/node15.html" - timestamp: "20020429162520" + timestamp: "2016-09-19T17:20:24Z" upperCaseName: "WEBCAPTURE_ENTITY_CDX" release_entity_abstracts: properties: diff --git a/rust/fatcat-api-spec/examples/client.rs b/rust/fatcat-api-spec/examples/client.rs index b4d90719..40a5a3ab 100644 --- a/rust/fatcat-api-spec/examples/client.rs +++ b/rust/fatcat-api-spec/examples/client.rs @@ -144,7 +144,15 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); // }, Some("CreateContainerBatch") => { - let result = client.create_container_batch(&Vec::new(), Some(true), Some("editgroup_id_example".to_string())).wait(); + let result = client + .create_container_batch( + &Vec::new(), + Some(true), + Some("editgroup_id_example".to_string()), + Some("description_example".to_string()), + Some("extra_example".to_string()), + ) + .wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); } @@ -211,7 +219,15 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); // }, Some("CreateCreatorBatch") => { - let result = client.create_creator_batch(&Vec::new(), Some(true), Some("editgroup_id_example".to_string())).wait(); + let result = client + .create_creator_batch( + &Vec::new(), + Some(true), + Some("editgroup_id_example".to_string()), + Some("description_example".to_string()), + Some("extra_example".to_string()), + ) + .wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); } @@ -359,7 +375,15 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); // }, Some("CreateFileBatch") => { - let result = client.create_file_batch(&Vec::new(), Some(true), Some("editgroup_id_example".to_string())).wait(); + let result = client + .create_file_batch( + &Vec::new(), + Some(true), + Some("editgroup_id_example".to_string()), + Some("description_example".to_string()), + Some("extra_example".to_string()), + ) + .wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); } @@ -427,7 +451,15 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); // }, Some("CreateFilesetBatch") => { - let result = client.create_fileset_batch(&Vec::new(), Some(true), Some("editgroup_id_example".to_string())).wait(); + let result = client + .create_fileset_batch( + &Vec::new(), + Some(true), + Some("editgroup_id_example".to_string()), + Some("description_example".to_string()), + Some("extra_example".to_string()), + ) + .wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); } @@ -482,7 +514,15 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); // }, 
Some("CreateReleaseBatch") => { - let result = client.create_release_batch(&Vec::new(), Some(true), Some("editgroup_id_example".to_string())).wait(); + let result = client + .create_release_batch( + &Vec::new(), + Some(true), + Some("editgroup_id_example".to_string()), + Some("description_example".to_string()), + Some("extra_example".to_string()), + ) + .wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); } @@ -554,6 +594,8 @@ fn main() { Some("pmid_example".to_string()), Some("pmcid_example".to_string()), Some("core_id_example".to_string()), + Some("arxiv_id_example".to_string()), + Some("jstor_id_example".to_string()), Some("expand_example".to_string()), Some("hide_example".to_string()), ) @@ -573,7 +615,15 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); // }, Some("CreateWebcaptureBatch") => { - let result = client.create_webcapture_batch(&Vec::new(), Some(true), Some("editgroup_id_example".to_string())).wait(); + let result = client + .create_webcapture_batch( + &Vec::new(), + Some(true), + Some("editgroup_id_example".to_string()), + Some("description_example".to_string()), + Some("extra_example".to_string()), + ) + .wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); } @@ -622,7 +672,15 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); // }, Some("CreateWorkBatch") => { - let result = client.create_work_batch(&Vec::new(), Some(true), Some("editgroup_id_example".to_string())).wait(); + let result = client + .create_work_batch( + &Vec::new(), + Some(true), + Some("editgroup_id_example".to_string()), + Some("description_example".to_string()), + Some("extra_example".to_string()), + ) + .wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from("<none>"))); } diff --git a/rust/fatcat-api-spec/examples/server_lib/server.rs b/rust/fatcat-api-spec/examples/server_lib/server.rs index 3c37106a..98f31485 100644 --- a/rust/fatcat-api-spec/examples/server_lib/server.rs +++ b/rust/fatcat-api-spec/examples/server_lib/server.rs @@ -46,14 +46,18 @@ impl Api for Server { entity_list: &Vec<models::ContainerEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateContainerBatchResponse, Error = ApiError> + Send> { let context = context.clone(); println!( - "create_container_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", + "create_container_batch({:?}, {:?}, {:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, editgroup_id, + description, + extra, context.x_span_id.unwrap_or(String::from("<none>")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -171,14 +175,18 @@ impl Api for Server { entity_list: &Vec<models::CreatorEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateCreatorBatchResponse, Error = ApiError> + Send> { let context = context.clone(); println!( - "create_creator_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", + "create_creator_batch({:?}, {:?}, {:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, editgroup_id, + description, + extra, 
context.x_span_id.unwrap_or(String::from("<none>")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -465,14 +473,18 @@ impl Api for Server { entity_list: &Vec<models::FileEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateFileBatchResponse, Error = ApiError> + Send> { let context = context.clone(); println!( - "create_file_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", + "create_file_batch({:?}, {:?}, {:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, editgroup_id, + description, + extra, context.x_span_id.unwrap_or(String::from("<none>")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -592,14 +604,18 @@ impl Api for Server { entity_list: &Vec<models::FilesetEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateFilesetBatchResponse, Error = ApiError> + Send> { let context = context.clone(); println!( - "create_fileset_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", + "create_fileset_batch({:?}, {:?}, {:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, editgroup_id, + description, + extra, context.x_span_id.unwrap_or(String::from("<none>")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -697,14 +713,18 @@ impl Api for Server { entity_list: &Vec<models::ReleaseEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateReleaseBatchResponse, Error = ApiError> + Send> { let context = context.clone(); println!( - "create_release_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", + "create_release_batch({:?}, {:?}, {:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, editgroup_id, + description, + extra, context.x_span_id.unwrap_or(String::from("<none>")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -826,19 +846,23 @@ impl Api for Server { pmid: Option<String>, pmcid: Option<String>, core_id: Option<String>, + arxiv_id: Option<String>, + jstor_id: Option<String>, expand: Option<String>, hide: Option<String>, context: &Context, ) -> Box<Future<Item = LookupReleaseResponse, Error = ApiError> + Send> { let context = context.clone(); println!( - "lookup_release({:?}, {:?}, {:?}, {:?}, {:?}, {:?}, {:?}, {:?}) - X-Span-ID: {:?}", + "lookup_release({:?}, {:?}, {:?}, {:?}, {:?}, {:?}, {:?}, {:?}, {:?}, {:?}) - X-Span-ID: {:?}", doi, wikidata_qid, isbn13, pmid, pmcid, core_id, + arxiv_id, + jstor_id, expand, hide, context.x_span_id.unwrap_or(String::from("<none>")).clone() @@ -874,14 +898,18 @@ impl Api for Server { entity_list: &Vec<models::WebcaptureEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateWebcaptureBatchResponse, Error = ApiError> + Send> { let context = context.clone(); println!( - "create_webcapture_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", + "create_webcapture_batch({:?}, {:?}, {:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, editgroup_id, + description, + extra, context.x_span_id.unwrap_or(String::from("<none>")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -968,14 +996,18 @@ impl Api for Server { entity_list: &Vec<models::WorkEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: 
Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateWorkBatchResponse, Error = ApiError> + Send> { let context = context.clone(); println!( - "create_work_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", + "create_work_batch({:?}, {:?}, {:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, editgroup_id, + description, + extra, context.x_span_id.unwrap_or(String::from("<none>")).clone() ); Box::new(futures::failed("Generic failure".into())) diff --git a/rust/fatcat-api-spec/src/client.rs b/rust/fatcat-api-spec/src/client.rs index a3e97fb3..6d654146 100644 --- a/rust/fatcat-api-spec/src/client.rs +++ b/rust/fatcat-api-spec/src/client.rs @@ -272,17 +272,23 @@ impl Api for Client { param_entity_list: &Vec<models::ContainerEntity>, param_autoaccept: Option<bool>, param_editgroup_id: Option<String>, + param_description: Option<String>, + param_extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateContainerBatchResponse, Error = ApiError> + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); let query_editgroup_id = param_editgroup_id.map_or_else(String::new, |query| format!("editgroup_id={editgroup_id}&", editgroup_id = query.to_string())); + let query_description = param_description.map_or_else(String::new, |query| format!("description={description}&", description = query.to_string())); + let query_extra = param_extra.map_or_else(String::new, |query| format!("extra={extra}&", extra = query.to_string())); let url = format!( - "{}/v0/container/batch?{autoaccept}{editgroup_id}", + "{}/v0/container/batch?{autoaccept}{editgroup_id}{description}{extra}", self.base_path, autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), - editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET) + editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET), + description = utf8_percent_encode(&query_description, QUERY_ENCODE_SET), + extra = utf8_percent_encode(&query_extra, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -1167,17 +1173,23 @@ impl Api for Client { param_entity_list: &Vec<models::CreatorEntity>, param_autoaccept: Option<bool>, param_editgroup_id: Option<String>, + param_description: Option<String>, + param_extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateCreatorBatchResponse, Error = ApiError> + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); let query_editgroup_id = param_editgroup_id.map_or_else(String::new, |query| format!("editgroup_id={editgroup_id}&", editgroup_id = query.to_string())); + let query_description = param_description.map_or_else(String::new, |query| format!("description={description}&", description = query.to_string())); + let query_extra = param_extra.map_or_else(String::new, |query| format!("extra={extra}&", extra = query.to_string())); let url = format!( - "{}/v0/creator/batch?{autoaccept}{editgroup_id}", + "{}/v0/creator/batch?{autoaccept}{editgroup_id}{description}{extra}", self.base_path, autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), - editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET) + editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET), + description = 
utf8_percent_encode(&query_description, QUERY_ENCODE_SET), + extra = utf8_percent_encode(&query_extra, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -3364,17 +3376,23 @@ impl Api for Client { param_entity_list: &Vec<models::FileEntity>, param_autoaccept: Option<bool>, param_editgroup_id: Option<String>, + param_description: Option<String>, + param_extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateFileBatchResponse, Error = ApiError> + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); let query_editgroup_id = param_editgroup_id.map_or_else(String::new, |query| format!("editgroup_id={editgroup_id}&", editgroup_id = query.to_string())); + let query_description = param_description.map_or_else(String::new, |query| format!("description={description}&", description = query.to_string())); + let query_extra = param_extra.map_or_else(String::new, |query| format!("extra={extra}&", extra = query.to_string())); let url = format!( - "{}/v0/file/batch?{autoaccept}{editgroup_id}", + "{}/v0/file/batch?{autoaccept}{editgroup_id}{description}{extra}", self.base_path, autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), - editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET) + editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET), + description = utf8_percent_encode(&query_description, QUERY_ENCODE_SET), + extra = utf8_percent_encode(&query_extra, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -4256,17 +4274,23 @@ impl Api for Client { param_entity_list: &Vec<models::FilesetEntity>, param_autoaccept: Option<bool>, param_editgroup_id: Option<String>, + param_description: Option<String>, + param_extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateFilesetBatchResponse, Error = ApiError> + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); let query_editgroup_id = param_editgroup_id.map_or_else(String::new, |query| format!("editgroup_id={editgroup_id}&", editgroup_id = query.to_string())); + let query_description = param_description.map_or_else(String::new, |query| format!("description={description}&", description = query.to_string())); + let query_extra = param_extra.map_or_else(String::new, |query| format!("extra={extra}&", extra = query.to_string())); let url = format!( - "{}/v0/fileset/batch?{autoaccept}{editgroup_id}", + "{}/v0/fileset/batch?{autoaccept}{editgroup_id}{description}{extra}", self.base_path, autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), - editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET) + editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET), + description = utf8_percent_encode(&query_description, QUERY_ENCODE_SET), + extra = utf8_percent_encode(&query_extra, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -5071,17 +5095,23 @@ impl Api for Client { param_entity_list: &Vec<models::ReleaseEntity>, param_autoaccept: Option<bool>, param_editgroup_id: Option<String>, + param_description: Option<String>, + param_extra: Option<String>, context: &Context, ) -> Box<Future<Item = 
CreateReleaseBatchResponse, Error = ApiError> + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); let query_editgroup_id = param_editgroup_id.map_or_else(String::new, |query| format!("editgroup_id={editgroup_id}&", editgroup_id = query.to_string())); + let query_description = param_description.map_or_else(String::new, |query| format!("description={description}&", description = query.to_string())); + let query_extra = param_extra.map_or_else(String::new, |query| format!("extra={extra}&", extra = query.to_string())); let url = format!( - "{}/v0/release/batch?{autoaccept}{editgroup_id}", + "{}/v0/release/batch?{autoaccept}{editgroup_id}{description}{extra}", self.base_path, autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), - editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET) + editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET), + description = utf8_percent_encode(&query_description, QUERY_ENCODE_SET), + extra = utf8_percent_encode(&query_extra, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -5988,6 +6018,8 @@ impl Api for Client { param_pmid: Option<String>, param_pmcid: Option<String>, param_core_id: Option<String>, + param_arxiv_id: Option<String>, + param_jstor_id: Option<String>, param_expand: Option<String>, param_hide: Option<String>, context: &Context, @@ -5999,11 +6031,13 @@ impl Api for Client { let query_pmid = param_pmid.map_or_else(String::new, |query| format!("pmid={pmid}&", pmid = query.to_string())); let query_pmcid = param_pmcid.map_or_else(String::new, |query| format!("pmcid={pmcid}&", pmcid = query.to_string())); let query_core_id = param_core_id.map_or_else(String::new, |query| format!("core_id={core_id}&", core_id = query.to_string())); + let query_arxiv_id = param_arxiv_id.map_or_else(String::new, |query| format!("arxiv_id={arxiv_id}&", arxiv_id = query.to_string())); + let query_jstor_id = param_jstor_id.map_or_else(String::new, |query| format!("jstor_id={jstor_id}&", jstor_id = query.to_string())); let query_expand = param_expand.map_or_else(String::new, |query| format!("expand={expand}&", expand = query.to_string())); let query_hide = param_hide.map_or_else(String::new, |query| format!("hide={hide}&", hide = query.to_string())); let url = format!( - "{}/v0/release/lookup?{doi}{wikidata_qid}{isbn13}{pmid}{pmcid}{core_id}{expand}{hide}", + "{}/v0/release/lookup?{doi}{wikidata_qid}{isbn13}{pmid}{pmcid}{core_id}{arxiv_id}{jstor_id}{expand}{hide}", self.base_path, doi = utf8_percent_encode(&query_doi, QUERY_ENCODE_SET), wikidata_qid = utf8_percent_encode(&query_wikidata_qid, QUERY_ENCODE_SET), @@ -6011,6 +6045,8 @@ impl Api for Client { pmid = utf8_percent_encode(&query_pmid, QUERY_ENCODE_SET), pmcid = utf8_percent_encode(&query_pmcid, QUERY_ENCODE_SET), core_id = utf8_percent_encode(&query_core_id, QUERY_ENCODE_SET), + arxiv_id = utf8_percent_encode(&query_arxiv_id, QUERY_ENCODE_SET), + jstor_id = utf8_percent_encode(&query_jstor_id, QUERY_ENCODE_SET), expand = utf8_percent_encode(&query_expand, QUERY_ENCODE_SET), hide = utf8_percent_encode(&query_hide, QUERY_ENCODE_SET) ); @@ -6272,17 +6308,23 @@ impl Api for Client { param_entity_list: &Vec<models::WebcaptureEntity>, param_autoaccept: Option<bool>, param_editgroup_id: Option<String>, + param_description: Option<String>, + param_extra: Option<String>, context: &Context, 
) -> Box<Future<Item = CreateWebcaptureBatchResponse, Error = ApiError> + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); let query_editgroup_id = param_editgroup_id.map_or_else(String::new, |query| format!("editgroup_id={editgroup_id}&", editgroup_id = query.to_string())); + let query_description = param_description.map_or_else(String::new, |query| format!("description={description}&", description = query.to_string())); + let query_extra = param_extra.map_or_else(String::new, |query| format!("extra={extra}&", extra = query.to_string())); let url = format!( - "{}/v0/webcapture/batch?{autoaccept}{editgroup_id}", + "{}/v0/webcapture/batch?{autoaccept}{editgroup_id}{description}{extra}", self.base_path, autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), - editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET) + editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET), + description = utf8_percent_encode(&query_description, QUERY_ENCODE_SET), + extra = utf8_percent_encode(&query_extra, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -6993,17 +7035,23 @@ impl Api for Client { param_entity_list: &Vec<models::WorkEntity>, param_autoaccept: Option<bool>, param_editgroup_id: Option<String>, + param_description: Option<String>, + param_extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateWorkBatchResponse, Error = ApiError> + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); let query_editgroup_id = param_editgroup_id.map_or_else(String::new, |query| format!("editgroup_id={editgroup_id}&", editgroup_id = query.to_string())); + let query_description = param_description.map_or_else(String::new, |query| format!("description={description}&", description = query.to_string())); + let query_extra = param_extra.map_or_else(String::new, |query| format!("extra={extra}&", extra = query.to_string())); let url = format!( - "{}/v0/work/batch?{autoaccept}{editgroup_id}", + "{}/v0/work/batch?{autoaccept}{editgroup_id}{description}{extra}", self.base_path, autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), - editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET) + editgroup_id = utf8_percent_encode(&query_editgroup_id, QUERY_ENCODE_SET), + description = utf8_percent_encode(&query_description, QUERY_ENCODE_SET), + extra = utf8_percent_encode(&query_extra, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); diff --git a/rust/fatcat-api-spec/src/lib.rs b/rust/fatcat-api-spec/src/lib.rs index 9585f1c0..59129869 100644 --- a/rust/fatcat-api-spec/src/lib.rs +++ b/rust/fatcat-api-spec/src/lib.rs @@ -1345,6 +1345,8 @@ pub trait Api { entity_list: &Vec<models::ContainerEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateContainerBatchResponse, Error = ApiError> + Send>; @@ -1380,6 +1382,8 @@ pub trait Api { entity_list: &Vec<models::CreatorEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateCreatorBatchResponse, Error 
= ApiError> + Send>; @@ -1473,6 +1477,8 @@ pub trait Api { entity_list: &Vec<models::FileEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateFileBatchResponse, Error = ApiError> + Send>; @@ -1509,6 +1515,8 @@ pub trait Api { entity_list: &Vec<models::FilesetEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateFilesetBatchResponse, Error = ApiError> + Send>; @@ -1535,6 +1543,8 @@ pub trait Api { entity_list: &Vec<models::ReleaseEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateReleaseBatchResponse, Error = ApiError> + Send>; @@ -1568,6 +1578,8 @@ pub trait Api { pmid: Option<String>, pmcid: Option<String>, core_id: Option<String>, + arxiv_id: Option<String>, + jstor_id: Option<String>, expand: Option<String>, hide: Option<String>, context: &Context, @@ -1582,6 +1594,8 @@ pub trait Api { entity_list: &Vec<models::WebcaptureEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateWebcaptureBatchResponse, Error = ApiError> + Send>; @@ -1606,6 +1620,8 @@ pub trait Api { entity_list: &Vec<models::WorkEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, context: &Context, ) -> Box<Future<Item = CreateWorkBatchResponse, Error = ApiError> + Send>; @@ -1637,6 +1653,8 @@ pub trait ApiNoContext { entity_list: &Vec<models::ContainerEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateContainerBatchResponse, Error = ApiError> + Send>; fn delete_container(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteContainerResponse, Error = ApiError> + Send>; @@ -1670,6 +1688,8 @@ pub trait ApiNoContext { entity_list: &Vec<models::CreatorEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateCreatorBatchResponse, Error = ApiError> + Send>; fn delete_creator(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteCreatorResponse, Error = ApiError> + Send>; @@ -1742,7 +1762,14 @@ pub trait ApiNoContext { fn create_file(&self, entity: models::FileEntity, editgroup_id: String) -> Box<Future<Item = CreateFileResponse, Error = ApiError> + Send>; - fn create_file_batch(&self, entity_list: &Vec<models::FileEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>) -> Box<Future<Item = CreateFileBatchResponse, Error = ApiError> + Send>; + fn create_file_batch( + &self, + entity_list: &Vec<models::FileEntity>, + autoaccept: Option<bool>, + editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, + ) -> Box<Future<Item = CreateFileBatchResponse, Error = ApiError> + Send>; fn delete_file(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteFileResponse, Error = ApiError> + Send>; @@ -1776,6 +1803,8 @@ pub trait ApiNoContext { entity_list: &Vec<models::FilesetEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateFilesetBatchResponse, 
Error = ApiError> + Send>; fn delete_fileset(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteFilesetResponse, Error = ApiError> + Send>; @@ -1801,6 +1830,8 @@ pub trait ApiNoContext { entity_list: &Vec<models::ReleaseEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateReleaseBatchResponse, Error = ApiError> + Send>; fn create_work(&self, entity: models::WorkEntity, editgroup_id: String) -> Box<Future<Item = CreateWorkResponse, Error = ApiError> + Send>; @@ -1833,6 +1864,8 @@ pub trait ApiNoContext { pmid: Option<String>, pmcid: Option<String>, core_id: Option<String>, + arxiv_id: Option<String>, + jstor_id: Option<String>, expand: Option<String>, hide: Option<String>, ) -> Box<Future<Item = LookupReleaseResponse, Error = ApiError> + Send>; @@ -1846,6 +1879,8 @@ pub trait ApiNoContext { entity_list: &Vec<models::WebcaptureEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateWebcaptureBatchResponse, Error = ApiError> + Send>; fn delete_webcapture(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteWebcaptureResponse, Error = ApiError> + Send>; @@ -1864,7 +1899,14 @@ pub trait ApiNoContext { fn update_webcapture(&self, ident: String, entity: models::WebcaptureEntity, editgroup_id: String) -> Box<Future<Item = UpdateWebcaptureResponse, Error = ApiError> + Send>; - fn create_work_batch(&self, entity_list: &Vec<models::WorkEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>) -> Box<Future<Item = CreateWorkBatchResponse, Error = ApiError> + Send>; + fn create_work_batch( + &self, + entity_list: &Vec<models::WorkEntity>, + autoaccept: Option<bool>, + editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, + ) -> Box<Future<Item = CreateWorkBatchResponse, Error = ApiError> + Send>; fn delete_work(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteWorkResponse, Error = ApiError> + Send>; @@ -1910,8 +1952,10 @@ impl<'a, T: Api> ApiNoContext for ContextWrapper<'a, T> { entity_list: &Vec<models::ContainerEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateContainerBatchResponse, Error = ApiError> + Send> { - self.api().create_container_batch(entity_list, autoaccept, editgroup_id, &self.context()) + self.api().create_container_batch(entity_list, autoaccept, editgroup_id, description, extra, &self.context()) } fn delete_container(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteContainerResponse, Error = ApiError> + Send> { @@ -1965,8 +2009,10 @@ impl<'a, T: Api> ApiNoContext for ContextWrapper<'a, T> { entity_list: &Vec<models::CreatorEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateCreatorBatchResponse, Error = ApiError> + Send> { - self.api().create_creator_batch(entity_list, autoaccept, editgroup_id, &self.context()) + self.api().create_creator_batch(entity_list, autoaccept, editgroup_id, description, extra, &self.context()) } fn delete_creator(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteCreatorResponse, Error = ApiError> + Send> { @@ -2091,8 +2137,15 @@ impl<'a, T: Api> ApiNoContext for ContextWrapper<'a, T> { self.api().create_file(entity, editgroup_id, 
&self.context()) } - fn create_file_batch(&self, entity_list: &Vec<models::FileEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>) -> Box<Future<Item = CreateFileBatchResponse, Error = ApiError> + Send> { - self.api().create_file_batch(entity_list, autoaccept, editgroup_id, &self.context()) + fn create_file_batch( + &self, + entity_list: &Vec<models::FileEntity>, + autoaccept: Option<bool>, + editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, + ) -> Box<Future<Item = CreateFileBatchResponse, Error = ApiError> + Send> { + self.api().create_file_batch(entity_list, autoaccept, editgroup_id, description, extra, &self.context()) } fn delete_file(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteFileResponse, Error = ApiError> + Send> { @@ -2147,8 +2200,10 @@ impl<'a, T: Api> ApiNoContext for ContextWrapper<'a, T> { entity_list: &Vec<models::FilesetEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateFilesetBatchResponse, Error = ApiError> + Send> { - self.api().create_fileset_batch(entity_list, autoaccept, editgroup_id, &self.context()) + self.api().create_fileset_batch(entity_list, autoaccept, editgroup_id, description, extra, &self.context()) } fn delete_fileset(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteFilesetResponse, Error = ApiError> + Send> { @@ -2192,8 +2247,10 @@ impl<'a, T: Api> ApiNoContext for ContextWrapper<'a, T> { entity_list: &Vec<models::ReleaseEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateReleaseBatchResponse, Error = ApiError> + Send> { - self.api().create_release_batch(entity_list, autoaccept, editgroup_id, &self.context()) + self.api().create_release_batch(entity_list, autoaccept, editgroup_id, description, extra, &self.context()) } fn create_work(&self, entity: models::WorkEntity, editgroup_id: String) -> Box<Future<Item = CreateWorkResponse, Error = ApiError> + Send> { @@ -2248,10 +2305,13 @@ impl<'a, T: Api> ApiNoContext for ContextWrapper<'a, T> { pmid: Option<String>, pmcid: Option<String>, core_id: Option<String>, + arxiv_id: Option<String>, + jstor_id: Option<String>, expand: Option<String>, hide: Option<String>, ) -> Box<Future<Item = LookupReleaseResponse, Error = ApiError> + Send> { - self.api().lookup_release(doi, wikidata_qid, isbn13, pmid, pmcid, core_id, expand, hide, &self.context()) + self.api() + .lookup_release(doi, wikidata_qid, isbn13, pmid, pmcid, core_id, arxiv_id, jstor_id, expand, hide, &self.context()) } fn update_release(&self, ident: String, entity: models::ReleaseEntity, editgroup_id: String) -> Box<Future<Item = UpdateReleaseResponse, Error = ApiError> + Send> { @@ -2267,8 +2327,10 @@ impl<'a, T: Api> ApiNoContext for ContextWrapper<'a, T> { entity_list: &Vec<models::WebcaptureEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, ) -> Box<Future<Item = CreateWebcaptureBatchResponse, Error = ApiError> + Send> { - self.api().create_webcapture_batch(entity_list, autoaccept, editgroup_id, &self.context()) + self.api().create_webcapture_batch(entity_list, autoaccept, editgroup_id, description, extra, &self.context()) } fn delete_webcapture(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteWebcaptureResponse, Error = ApiError> + Send> { @@ -2303,8 +2365,15 @@ impl<'a, T: 
Api> ApiNoContext for ContextWrapper<'a, T> { self.api().update_webcapture(ident, entity, editgroup_id, &self.context()) } - fn create_work_batch(&self, entity_list: &Vec<models::WorkEntity>, autoaccept: Option<bool>, editgroup_id: Option<String>) -> Box<Future<Item = CreateWorkBatchResponse, Error = ApiError> + Send> { - self.api().create_work_batch(entity_list, autoaccept, editgroup_id, &self.context()) + fn create_work_batch( + &self, + entity_list: &Vec<models::WorkEntity>, + autoaccept: Option<bool>, + editgroup_id: Option<String>, + description: Option<String>, + extra: Option<String>, + ) -> Box<Future<Item = CreateWorkBatchResponse, Error = ApiError> + Send> { + self.api().create_work_batch(entity_list, autoaccept, editgroup_id, description, extra, &self.context()) } fn delete_work(&self, ident: String, editgroup_id: String) -> Box<Future<Item = DeleteWorkResponse, Error = ApiError> + Send> { diff --git a/rust/fatcat-api-spec/src/models.rs b/rust/fatcat-api-spec/src/models.rs index 5d05b737..d6e6e07f 100644 --- a/rust/fatcat-api-spec/src/models.rs +++ b/rust/fatcat-api-spec/src/models.rs @@ -79,14 +79,6 @@ impl ChangelogEntry { #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct ContainerEntity { - #[serde(rename = "coden")] - #[serde(skip_serializing_if = "Option::is_none")] - pub coden: Option<String>, - - #[serde(rename = "abbrev")] - #[serde(skip_serializing_if = "Option::is_none")] - pub abbrev: Option<String>, - #[serde(rename = "wikidata_qid")] #[serde(skip_serializing_if = "Option::is_none")] pub wikidata_qid: Option<String>, @@ -99,6 +91,11 @@ pub struct ContainerEntity { #[serde(skip_serializing_if = "Option::is_none")] pub publisher: Option<String>, + /// Eg, 'journal' + #[serde(rename = "container_type")] + #[serde(skip_serializing_if = "Option::is_none")] + pub container_type: Option<String>, + /// Required for valid entities #[serde(rename = "name")] #[serde(skip_serializing_if = "Option::is_none")] @@ -136,11 +133,10 @@ pub struct ContainerEntity { impl ContainerEntity { pub fn new() -> ContainerEntity { ContainerEntity { - coden: None, - abbrev: None, wikidata_qid: None, issnl: None, publisher: None, + container_type: None, name: None, edit_extra: None, extra: None, @@ -709,13 +705,18 @@ pub struct ReleaseContrib { #[serde(skip_serializing_if = "Option::is_none")] pub raw_name: Option<String>, - #[serde(rename = "extra")] - #[serde(skip_serializing_if = "Option::is_none")] - pub extra: Option<serde_json::Value>, - #[serde(rename = "role")] #[serde(skip_serializing_if = "Option::is_none")] pub role: Option<String>, + + /// Raw affiliation string as displayed in text + #[serde(rename = "raw_affiliation")] + #[serde(skip_serializing_if = "Option::is_none")] + pub raw_affiliation: Option<String>, + + #[serde(rename = "extra")] + #[serde(skip_serializing_if = "Option::is_none")] + pub extra: Option<serde_json::Value>, } impl ReleaseContrib { @@ -725,8 +726,9 @@ impl ReleaseContrib { creator_id: None, creator: None, raw_name: None, - extra: None, role: None, + raw_affiliation: None, + extra: None, } } } @@ -745,6 +747,11 @@ pub struct ReleaseEntity { #[serde(skip_serializing_if = "Option::is_none")] pub contribs: Option<Vec<models::ReleaseContrib>>, + /// Short version of license name. 
Eg, 'CC-BY' + #[serde(rename = "license_slug")] + #[serde(skip_serializing_if = "Option::is_none")] + pub license_slug: Option<String>, + /// Two-letter RFC1766/ISO639-1 language code, with extensions #[serde(rename = "language")] #[serde(skip_serializing_if = "Option::is_none")] @@ -766,6 +773,14 @@ pub struct ReleaseEntity { #[serde(skip_serializing_if = "Option::is_none")] pub volume: Option<String>, + #[serde(rename = "jstor_id")] + #[serde(skip_serializing_if = "Option::is_none")] + pub jstor_id: Option<String>, + + #[serde(rename = "arxiv_id")] + #[serde(skip_serializing_if = "Option::is_none")] + pub arxiv_id: Option<String>, + #[serde(rename = "core_id")] #[serde(skip_serializing_if = "Option::is_none")] pub core_id: Option<String>, @@ -834,7 +849,12 @@ pub struct ReleaseEntity { #[serde(skip_serializing_if = "Option::is_none")] pub work_id: Option<String>, - /// Required for valid entities + /// Title in original language (or, the language of the full text of this release) + #[serde(rename = "original_title")] + #[serde(skip_serializing_if = "Option::is_none")] + pub original_title: Option<String>, + + /// Required for valid entities. The title used in citations and for display; usually English #[serde(rename = "title")] #[serde(skip_serializing_if = "Option::is_none")] pub title: Option<String>, @@ -874,11 +894,14 @@ impl ReleaseEntity { abstracts: None, refs: None, contribs: None, + license_slug: None, language: None, publisher: None, pages: None, issue: None, volume: None, + jstor_id: None, + arxiv_id: None, core_id: None, pmcid: None, pmid: None, @@ -895,6 +918,7 @@ impl ReleaseEntity { files: None, container: None, work_id: None, + original_title: None, title: None, state: None, ident: None, @@ -1008,6 +1032,7 @@ pub struct WebcaptureEntity { #[serde(skip_serializing_if = "Option::is_none")] pub release_ids: Option<Vec<String>>, + /// same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Can be the earliest or average of CDX timestamps if that makes sense. 
#[serde(rename = "timestamp")] #[serde(skip_serializing_if = "Option::is_none")] pub timestamp: Option<chrono::DateTime<chrono::Utc>>, @@ -1091,8 +1116,9 @@ pub struct WebcaptureEntityCdx { #[serde(rename = "surt")] pub surt: String, + /// UTC, 'Z'-terminated, second (or better) precision #[serde(rename = "timestamp")] - pub timestamp: String, + pub timestamp: chrono::DateTime<chrono::Utc>, #[serde(rename = "url")] pub url: String, @@ -1114,7 +1140,7 @@ pub struct WebcaptureEntityCdx { } impl WebcaptureEntityCdx { - pub fn new(surt: String, timestamp: String, url: String, sha1: String) -> WebcaptureEntityCdx { + pub fn new(surt: String, timestamp: chrono::DateTime<chrono::Utc>, url: String, sha1: String) -> WebcaptureEntityCdx { WebcaptureEntityCdx { surt: surt, timestamp: timestamp, diff --git a/rust/fatcat-api-spec/src/server.rs b/rust/fatcat-api-spec/src/server.rs index 8b616959..af13948e 100644 --- a/rust/fatcat-api-spec/src/server.rs +++ b/rust/fatcat-api-spec/src/server.rs @@ -262,8 +262,15 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::<bool>().ok()); + let param_autoaccept = query_params + .get("autoaccept") + .and_then(|list| list.first()) + .and_then(|x| Some(x.to_lowercase().parse::<bool>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected boolean)".to_string())))?; let param_editgroup_id = query_params.get("editgroup_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_description = query_params.get("description").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_extra = query_params.get("extra").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). 
Produce warning header and logs for @@ -290,7 +297,10 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_container_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, context).wait() { + match api + .create_container_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, param_description, param_extra, context) + .wait() + { Ok(rsp) => match rsp { CreateContainerBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -837,7 +847,12 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; match api.get_container_history(param_ident, param_limit, context).wait() { Ok(rsp) => match rsp { @@ -1488,8 +1503,15 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::<bool>().ok()); + let param_autoaccept = query_params + .get("autoaccept") + .and_then(|list| list.first()) + .and_then(|x| Some(x.to_lowercase().parse::<bool>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected boolean)".to_string())))?; let param_editgroup_id = query_params.get("editgroup_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_description = query_params.get("description").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_extra = query_params.get("extra").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). 
Produce warning header and logs for @@ -1516,7 +1538,10 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_creator_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, context).wait() { + match api + .create_creator_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, param_description, param_extra, context) + .wait() + { Ok(rsp) => match rsp { CreateCreatorBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -2063,7 +2088,12 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; match api.get_creator_history(param_ident, param_limit, context).wait() { Ok(rsp) => match rsp { @@ -2901,9 +2931,24 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); let param_expand = query_params.get("expand").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); - let param_before = query_params.get("before").and_then(|list| list.first()).and_then(|x| x.parse::<chrono::DateTime<chrono::Utc>>().ok()); - let param_since = query_params.get("since").and_then(|list| list.first()).and_then(|x| x.parse::<chrono::DateTime<chrono::Utc>>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; + let param_before = query_params + .get("before") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<chrono::DateTime<chrono::Utc>>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected UTC datetime in ISO/RFC format)".to_string())))?; + let param_since = query_params + .get("since") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<chrono::DateTime<chrono::Utc>>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected UTC datetime in ISO/RFC format)".to_string())))?; match api.get_editgroups_reviewable(param_expand, param_limit, param_before, param_since, context).wait() { Ok(rsp) => match rsp { @@ -3085,9 +3130,24 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| 
list.first()).and_then(|x| x.parse::<i64>().ok()); - let param_before = query_params.get("before").and_then(|list| list.first()).and_then(|x| x.parse::<chrono::DateTime<chrono::Utc>>().ok()); - let param_since = query_params.get("since").and_then(|list| list.first()).and_then(|x| x.parse::<chrono::DateTime<chrono::Utc>>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; + let param_before = query_params + .get("before") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<chrono::DateTime<chrono::Utc>>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected UTC datetime in ISO/RFC format)".to_string())))?; + let param_since = query_params + .get("since") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<chrono::DateTime<chrono::Utc>>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected UTC datetime in ISO/RFC format)".to_string())))?; match api.get_editor_editgroups(param_editor_id, param_limit, param_before, param_since, context).wait() { Ok(rsp) => match rsp { @@ -3182,7 +3242,12 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_submit = query_params.get("submit").and_then(|list| list.first()).and_then(|x| x.parse::<bool>().ok()); + let param_submit = query_params + .get("submit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.to_lowercase().parse::<bool>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected boolean)".to_string())))?; // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). 
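`before` and `since` are now parsed as `chrono::DateTime<Utc>` with the same fail-with-400 behaviour, so clients have to send RFC 3339 / ISO 8601 timestamps with an explicit UTC designator rather than arbitrary date strings. A hypothetical client-side sketch (the endpoint path is illustrative):

```rust
// Sketch, assuming the chrono crate on the client side. The formatted string
// round-trips through the same DateTime<Utc> parser the handler above uses.
use chrono::{DateTime, SecondsFormat, Utc};

fn main() {
    let since = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true);
    // e.g. "2019-01-08T21:00:00Z"; the path below is illustrative
    println!("GET /v0/editor/{{editor_id}}/editgroups?since={}", since);

    // Round-trip check: parsing and re-formatting gives the same string back
    let parsed: DateTime<Utc> = since.parse().expect("valid RFC 3339 timestamp");
    assert_eq!(parsed.to_rfc3339_opts(SecondsFormat::Secs, true), since);
}
```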
Produce warning header and logs for @@ -3882,7 +3947,12 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; match api.get_changelog(param_limit, context).wait() { Ok(rsp) => match rsp { @@ -4261,9 +4331,24 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); - let param_before = query_params.get("before").and_then(|list| list.first()).and_then(|x| x.parse::<chrono::DateTime<chrono::Utc>>().ok()); - let param_since = query_params.get("since").and_then(|list| list.first()).and_then(|x| x.parse::<chrono::DateTime<chrono::Utc>>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; + let param_before = query_params + .get("before") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<chrono::DateTime<chrono::Utc>>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected UTC datetime in ISO/RFC format)".to_string())))?; + let param_since = query_params + .get("since") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<chrono::DateTime<chrono::Utc>>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected UTC datetime in ISO/RFC format)".to_string())))?; match api.get_editor_annotations(param_editor_id, param_limit, param_before, param_since, context).wait() { Ok(rsp) => match rsp { @@ -4512,8 +4597,15 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::<bool>().ok()); + let param_autoaccept = query_params + .get("autoaccept") + .and_then(|list| list.first()) + .and_then(|x| Some(x.to_lowercase().parse::<bool>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected boolean)".to_string())))?; let param_editgroup_id = query_params.get("editgroup_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_description = query_params.get("description").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_extra = query_params.get("extra").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); // Body parameters 
(note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). Produce warning header and logs for @@ -4540,7 +4632,10 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_file_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, context).wait() { + match api + .create_file_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, param_description, param_extra, context) + .wait() + { Ok(rsp) => match rsp { CreateFileBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -5087,7 +5182,12 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; match api.get_file_history(param_ident, param_limit, context).wait() { Ok(rsp) => match rsp { @@ -5739,8 +5839,15 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::<bool>().ok()); + let param_autoaccept = query_params + .get("autoaccept") + .and_then(|list| list.first()) + .and_then(|x| Some(x.to_lowercase().parse::<bool>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected boolean)".to_string())))?; let param_editgroup_id = query_params.get("editgroup_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_description = query_params.get("description").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_extra = query_params.get("extra").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). 
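Besides `autoaccept`, the batch-create endpoints now take optional `description` and `extra` query parameters that are forwarded into the editgroup (see the `make_edit_context` change in editing.rs further below). `extra` travels as a JSON object serialized to a string and is deserialized again server-side. An illustrative sketch of a client preparing those values (the path and the field names inside `extra` are assumptions, not part of the API spec):

```rust
// Sketch: how description/extra might be prepared by an import bot.
// serde_json serializes `extra`; the resulting string still needs
// URL-encoding before being placed in the query string.
use serde_json::json;

fn main() {
    let description = "batch import of file entities";
    let extra = json!({ "agent": "example-import-bot", "git_rev": "deadbeef" });
    let extra_str = serde_json::to_string(&extra).expect("serializable JSON");

    // Illustrative request line only; a real client would URL-encode the values.
    println!(
        "POST /v0/file/batch?autoaccept=true&description={}&extra={}",
        description, extra_str
    );
}
```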
Produce warning header and logs for @@ -5767,7 +5874,10 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_fileset_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, context).wait() { + match api + .create_fileset_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, param_description, param_extra, context) + .wait() + { Ok(rsp) => match rsp { CreateFilesetBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -6314,7 +6424,12 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; match api.get_fileset_history(param_ident, param_limit, context).wait() { Ok(rsp) => match rsp { @@ -6884,8 +6999,15 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::<bool>().ok()); + let param_autoaccept = query_params + .get("autoaccept") + .and_then(|list| list.first()) + .and_then(|x| Some(x.to_lowercase().parse::<bool>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected boolean)".to_string())))?; let param_editgroup_id = query_params.get("editgroup_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_description = query_params.get("description").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_extra = query_params.get("extra").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). 
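One detail of the boolean parameters (`autoaccept` above, `submit` earlier): the value is lowercased before being handed to Rust's `bool` parser, which only accepts the literal strings "true" and "false". Anything else now yields a 400 rather than being ignored. A quick sketch of the accepted forms:

```rust
// Sketch of what the `.to_lowercase().parse::<bool>()` chain accepts.
fn parse_bool_param(raw: &str) -> Result<bool, String> {
    raw.to_lowercase()
        .parse::<bool>()
        .map_err(|_| "unparsable query parameter (expected boolean)".to_string())
}

fn main() {
    assert_eq!(parse_bool_param("true"), Ok(true));
    assert_eq!(parse_bool_param("True"), Ok(true)); // lowercased first
    assert_eq!(parse_bool_param("FALSE"), Ok(false));
    assert!(parse_bool_param("1").is_err());   // not accepted by bool::from_str
    assert!(parse_bool_param("yes").is_err()); // also a 400 now
}
```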
Produce warning header and logs for @@ -6912,7 +7034,10 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_release_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, context).wait() { + match api + .create_release_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, param_description, param_extra, context) + .wait() + { Ok(rsp) => match rsp { CreateReleaseBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -7791,7 +7916,12 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; match api.get_release_history(param_ident, param_limit, context).wait() { Ok(rsp) => match rsp { @@ -8151,11 +8281,25 @@ where let param_pmid = query_params.get("pmid").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); let param_pmcid = query_params.get("pmcid").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); let param_core_id = query_params.get("core_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_arxiv_id = query_params.get("arxiv_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_jstor_id = query_params.get("jstor_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); let param_expand = query_params.get("expand").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); let param_hide = query_params.get("hide").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); match api - .lookup_release(param_doi, param_wikidata_qid, param_isbn13, param_pmid, param_pmcid, param_core_id, param_expand, param_hide, context) + .lookup_release( + param_doi, + param_wikidata_qid, + param_isbn13, + param_pmid, + param_pmcid, + param_core_id, + param_arxiv_id, + param_jstor_id, + param_expand, + param_hide, + context, + ) .wait() { Ok(rsp) => match rsp { @@ -8542,8 +8686,15 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::<bool>().ok()); + let param_autoaccept = query_params + .get("autoaccept") + .and_then(|list| list.first()) + .and_then(|x| Some(x.to_lowercase().parse::<bool>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected boolean)".to_string())))?; let param_editgroup_id = query_params.get("editgroup_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_description = query_params.get("description").and_then(|list| list.first()).and_then(|x| 
x.parse::<String>().ok()); + let param_extra = query_params.get("extra").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). Produce warning header and logs for @@ -8570,7 +8721,10 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_webcapture_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, context).wait() { + match api + .create_webcapture_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, param_description, param_extra, context) + .wait() + { Ok(rsp) => match rsp { CreateWebcaptureBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -9117,7 +9271,12 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; match api.get_webcapture_history(param_ident, param_limit, context).wait() { Ok(rsp) => match rsp { @@ -9541,8 +9700,15 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::<bool>().ok()); + let param_autoaccept = query_params + .get("autoaccept") + .and_then(|list| list.first()) + .and_then(|x| Some(x.to_lowercase().parse::<bool>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected boolean)".to_string())))?; let param_editgroup_id = query_params.get("editgroup_id").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_description = query_params.get("description").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); + let param_extra = query_params.get("extra").and_then(|list| list.first()).and_then(|x| x.parse::<String>().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). 
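Stepping back to the release lookup hunk a bit above: `lookup_release` now also accepts `arxiv_id` and `jstor_id`, and the widened tuple match in endpoint_handlers.rs (further below) still enforces that exactly one external identifier is supplied per request. A compact sketch of that rule (not the actual handler code; the arXiv and JSTOR identifiers below are made up):

```rust
// Sketch: the eight-way tuple match boils down to "exactly one identifier set".
fn exactly_one_lookup_id(ids: &[Option<&str>]) -> Result<usize, String> {
    let provided: Vec<usize> = ids
        .iter()
        .enumerate()
        .filter(|(_, v)| v.is_some())
        .map(|(i, _)| i)
        .collect();
    if provided.len() == 1 {
        Ok(provided[0])
    } else {
        // corresponds to FatcatError::MissingOrMultipleExternalId in the handler
        Err("in lookup".to_string())
    }
}

fn main() {
    // order: doi, wikidata_qid, isbn13, pmid, pmcid, core_id, arxiv_id, jstor_id
    let only_arxiv = [None, None, None, None, None, None, Some("1811.01021"), None];
    assert_eq!(exactly_one_lookup_id(&only_arxiv), Ok(6));

    let doi_and_jstor = [Some("10.123/abc"), None, None, None, None, None, None, Some("1819117")];
    assert!(exactly_one_lookup_id(&doi_and_jstor).is_err());
}
```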
Produce warning header and logs for @@ -9569,7 +9735,10 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_work_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, context).wait() { + match api + .create_work_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup_id, param_description, param_extra, context) + .wait() + { Ok(rsp) => match rsp { CreateWorkBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -10116,7 +10285,12 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::<UrlEncodedQuery>().unwrap_or_default(); - let param_limit = query_params.get("limit").and_then(|list| list.first()).and_then(|x| x.parse::<i64>().ok()); + let param_limit = query_params + .get("limit") + .and_then(|list| list.first()) + .and_then(|x| Some(x.parse::<i64>())) + .map_or_else(|| Ok(None), |x| x.map(|v| Some(v))) + .map_err(|x| Response::with((status::BadRequest, "unparsable query parameter (expected integer)".to_string())))?; match api.get_work_history(param_ident, param_limit, context).wait() { Ok(rsp) => match rsp { diff --git a/rust/migrations/2019-01-01-000000_init/down.sql b/rust/migrations/2019-01-01-000000_init/down.sql index 30e712e3..e238a690 100644 --- a/rust/migrations/2019-01-01-000000_init/down.sql +++ b/rust/migrations/2019-01-01-000000_init/down.sql @@ -2,6 +2,7 @@ -- in opposite order as up.sql DROP TABLE IF EXISTS release_contrib CASCADE; +DROP TABLE IF EXISTS refs_blob CASCADE; DROP TABLE IF EXISTS release_ref CASCADE; DROP TABLE IF EXISTS file_rev_release CASCADE; DROP TABLE IF EXISTS fileset_rev_release CASCADE; diff --git a/rust/migrations/2019-01-01-000000_init/up.sql b/rust/migrations/2019-01-01-000000_init/up.sql index b4c7a684..2bb3f4ec 100644 --- a/rust/migrations/2019-01-01-000000_init/up.sql +++ b/rust/migrations/2019-01-01-000000_init/up.sql @@ -1,4 +1,5 @@ --- written for Postgres 9.6 with OSSP extension for UUIDs -- ... but actually runs on Postgres 10 in qa/production +-- written for Postgres 9.6 with OSSP extension for UUIDs +-- ... but actually runs on Postgres 11 in qa/production/tests -- Previously VARCHAR and fixed-size CHAR was used in this schema for specific -- columns (especially fixed-size external identifiers, and hashes). This was @@ -47,7 +48,7 @@ CREATE TABLE editgroup ( created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, submitted TIMESTAMP WITH TIME ZONE, is_accepted BOOLEAN DEFAULT false NOT NULL, - description TEXT, + description TEXT CHECK (octet_length(description) >= 1), extra_json JSONB ); @@ -60,7 +61,7 @@ CREATE TABLE editgroup_annotation ( editgroup_id UUID REFERENCES editgroup(id) NOT NULL, editor_id UUID REFERENCES editor(id) NOT NULL, created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, - comment_markdown TEXT, + comment_markdown TEXT CHECK (octet_length(comment_markdown) >= 1), extra_json JSONB ); @@ -79,7 +80,13 @@ CREATE INDEX changelog_editgroup_idx ON changelog(editgroup_id); CREATE TABLE abstracts ( -- fixed size hash (in hex). TODO: switch to bytes sha1 TEXT PRIMARY KEY CHECK (octet_length(sha1) = 40), - content TEXT NOT NULL + content TEXT NOT NULL CHECK (octet_length(content) >= 8) +); + +CREATE TABLE refs_blob ( + -- fixed size hash (in hex). 
TODO: switch to bytes + sha1 TEXT PRIMARY KEY CHECK (octet_length(sha1) = 40), + refs_json JSONB NOT NULL ); -------------------- Creators ----------------------------------------------- @@ -87,9 +94,9 @@ CREATE TABLE creator_rev ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), extra_json JSONB, - display_name TEXT NOT NULL, - given_name TEXT, - surname TEXT, + display_name TEXT NOT NULL CHECK (octet_length(display_name) >= 1), + given_name TEXT CHECK (octet_length(given_name) >= 1), + surname TEXT CHECK (octet_length(surname) >= 1), -- fixed size identifier orcid TEXT CHECK(octet_length(orcid) = 19), -- limited size for data quality @@ -132,15 +139,13 @@ CREATE TABLE container_rev ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), extra_json JSONB, - name TEXT NOT NULL, - publisher TEXT, + name TEXT NOT NULL CHECK (octet_length(name) >= 1), + container_type TEXT, + publisher TEXT CHECK (octet_length(publisher) >= 1), -- fixed size identifier issnl TEXT CHECK(octet_length(issnl) = 9), -- limited size for data quality - wikidata_qid TEXT CHECK(octet_length(wikidata_qid) <= 12), - abbrev TEXT, - -- limited size for data quality - coden TEXT CHECK(octet_length(coden) <= 6) + wikidata_qid TEXT CHECK(octet_length(wikidata_qid) <= 12) ); CREATE INDEX container_rev_issnl_idx ON container_rev(issnl); @@ -175,10 +180,10 @@ CREATE TABLE file_rev ( size_bytes BIGINT, -- fixed size hashes (in hex). TODO: switch to binary type type - sha1 TEXT CHECK(octet_length(sha1) = 40), - sha256 TEXT CHECK(octet_length(sha256) = 64), - md5 TEXT CHECK(octet_length(md5) = 32), - mimetype TEXT + sha1 TEXT CHECK (octet_length(sha1) = 40), + sha256 TEXT CHECK (octet_length(sha256) = 64), + md5 TEXT CHECK (octet_length(md5) = 32), + mimetype TEXT CHECK (octet_length(mimetype) >= 3) ); CREATE INDEX file_rev_sha1_idx ON file_rev(sha1); @@ -188,8 +193,8 @@ CREATE INDEX file_rev_sha256_idx ON file_rev(sha256); CREATE TABLE file_rev_url ( id BIGSERIAL PRIMARY KEY, file_rev UUID REFERENCES file_rev(id) NOT NULL, - rel TEXT NOT NULL, -- TODO: enum? web, webarchive, repo, etc TODO: default web? - url TEXT NOT NULL + rel TEXT NOT NULL CHECK (octet_length(rel) >= 1), -- TODO: enum? web, webarchive, repo, etc + url TEXT NOT NULL CHECK (octet_length(url) >= 1) ); CREATE INDEX file_rev_url_rev_idx ON file_rev_url(file_rev); @@ -225,8 +230,8 @@ CREATE TABLE fileset_rev ( CREATE TABLE fileset_rev_url ( id BIGSERIAL PRIMARY KEY, fileset_rev UUID REFERENCES fileset_rev(id) NOT NULL, - rel TEXT NOT NULL, -- TODO: enum? web, webarchive, repo, etc TODO: default web? - url TEXT NOT NULL + rel TEXT NOT NULL CHECK (octet_length(rel) >= 1), -- TODO: enum? 
web, webarchive, repo, etc + url TEXT NOT NULL CHECK (octet_length(url) >= 1) ); CREATE INDEX fileset_rev_url_rev_idx ON fileset_rev_url(fileset_rev); @@ -234,7 +239,7 @@ CREATE INDEX fileset_rev_url_rev_idx ON fileset_rev_url(fileset_rev); CREATE TABLE fileset_rev_file ( id BIGSERIAL PRIMARY KEY, fileset_rev UUID REFERENCES fileset_rev(id) NOT NULL, - path_name TEXT NOT NULL, + path_name TEXT NOT NULL CHECK (octet_length(path_name) >= 1), size_bytes BIGINT NOT NULL, md5 TEXT CHECK(octet_length(md5) = 32), sha1 TEXT CHECK(octet_length(sha1) = 40), @@ -270,15 +275,15 @@ CREATE TABLE webcapture_rev ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), extra_json JSONB, - original_url TEXT NOT NULL, + original_url TEXT NOT NULL CHECK (octet_length(original_url) >= 1), timestamp TIMESTAMP WITH TIME ZONE NOT NULL ); CREATE TABLE webcapture_rev_url ( id BIGSERIAL PRIMARY KEY, webcapture_rev UUID REFERENCES webcapture_rev(id) NOT NULL, - rel TEXT NOT NULL, -- TODO: enum? web, webarchive, repo, etc TODO: default web? - url TEXT NOT NULL + rel TEXT NOT NULL CHECK (octet_length(rel) >= 1), -- TODO: enum? web, webarchive, repo, etc + url TEXT NOT NULL CHECK (octet_length(url) >= 1) ); CREATE INDEX webcapture_rev_url_rev_idx ON webcapture_rev_url(webcapture_rev); @@ -286,10 +291,10 @@ CREATE INDEX webcapture_rev_url_rev_idx ON webcapture_rev_url(webcapture_rev); CREATE TABLE webcapture_rev_cdx ( id BIGSERIAL PRIMARY KEY, webcapture_rev UUID REFERENCES webcapture_rev(id) NOT NULL, - surt TEXT NOT NULL, - timestamp TEXT NOT NULL, -- TODO: timestamp type? - url TEXT NOT NULL, - mimetype TEXT, + surt TEXT NOT NULL CHECK (octet_length(surt) >= 1), + timestamp TIMESTAMP WITH TIME ZONE NOT NULL, + url TEXT NOT NULL CHECK (octet_length(url) >= 1), + mimetype TEXT CHECK (octet_length(mimetype) >= 1), status_code BIGINT, sha1 TEXT CHECK(octet_length(sha1) = 40) NOT NULL, sha256 TEXT CHECK(octet_length(sha256) = 64) @@ -326,23 +331,28 @@ CREATE TABLE release_rev ( work_ident_id UUID NOT NULL, -- FOREIGN KEY; see ALRTER below container_ident_id UUID REFERENCES container_ident(id), - title TEXT NOT NULL, + refs_blob_sha1 TEXT REFERENCES refs_blob(sha1), + title TEXT NOT NULL CHECK (octet_length(title) >= 1), + original_title TEXT CHECK (octet_length(original_title) >= 1), release_type TEXT, -- TODO: enum release_status TEXT, -- TODO: enum release_date DATE, release_year BIGINT, - doi TEXT, + doi TEXT CHECK (octet_length(doi) >= 7), -- CHECK for length limit for data quality - pmid TEXT CHECK(octet_length(pmid) <= 12), - pmcid TEXT CHECK(octet_length(pmcid) <= 12), - wikidata_qid TEXT CHECK(octet_length(wikidata_qid) <= 12), - isbn13 TEXT CHECK(octet_length(isbn13) = 17), - core_id TEXT CHECK(octet_length(core_id) <= 12), - volume TEXT, - issue TEXT, - pages TEXT, - publisher TEXT, -- for books, NOT if container exists - language TEXT -- primary language of the work's fulltext; RFC1766/ISO639-1 + pmid TEXT CHECK (octet_length(pmid) <= 12), + pmcid TEXT CHECK (octet_length(pmcid) <= 12), + wikidata_qid TEXT CHECK (octet_length(wikidata_qid) <= 12), + isbn13 TEXT CHECK (octet_length(isbn13) = 17), + core_id TEXT CHECK (octet_length(core_id) <= 12), + arxiv_id TEXT CHECK (octet_length(arxiv_id) <= 12), + jstor_id TEXT CHECK (octet_length(jstor_id) <= 12), + volume TEXT CHECK (octet_length(volume) >= 1), + issue TEXT CHECK (octet_length(issue) >= 1), + pages TEXT CHECK (octet_length(pages) >= 1), + publisher TEXT CHECK (octet_length(publisher) >= 1), -- for books, NOT if container exists + language TEXT CHECK 
(octet_length(language) >= 1), -- primary language of the work's fulltext; RFC1766/ISO639-1 + license_slug TEXT CHECK (octet_length(license_slug) >= 1) -- TODO: oclc_ocn (TEXT or BIGINT) -- TODO: identifier table? ); @@ -353,14 +363,16 @@ CREATE INDEX release_rev_pmcid_idx ON release_rev(pmcid); CREATE INDEX release_rev_wikidata_idx ON release_rev(wikidata_qid); CREATE INDEX release_rev_isbn13_idx ON release_rev(isbn13); CREATE INDEX release_rev_core_idx ON release_rev(core_id); +CREATE INDEX release_rev_arxiv_idx ON release_rev(arxiv_id); +CREATE INDEX release_rev_jstor_idx ON release_rev(jstor_id); CREATE INDEX release_rev_work_idx ON release_rev(work_ident_id); CREATE TABLE release_rev_abstract ( id BIGSERIAL PRIMARY KEY, release_rev UUID REFERENCES release_rev(id) NOT NULL, abstract_sha1 TEXT REFERENCES abstracts(sha1) NOT NULL, - mimetype TEXT, - lang TEXT + mimetype TEXT CHECK (octet_length(mimetype) >= 1), + lang TEXT CHECK (octet_length(lang) >= 1) ); CREATE INDEX release_rev_abstract_rev_idx ON release_rev_abstract(release_rev); @@ -426,8 +438,9 @@ CREATE TABLE release_contrib ( id BIGSERIAL PRIMARY KEY, release_rev UUID REFERENCES release_rev(id) NOT NULL, creator_ident_id UUID REFERENCES creator_ident(id), - raw_name TEXT, + raw_name TEXT CHECK (octet_length(raw_name) >= 1), role TEXT, -- TODO: enum? + raw_affiliation TEXT CHECK (octet_length(raw_affiliation) >= 1), index_val INTEGER, extra_json JSONB ); @@ -436,20 +449,19 @@ CREATE INDEX release_contrib_rev_idx ON release_contrib(release_rev); CREATE INDEX release_contrib_creator_idx ON release_contrib(creator_ident_id); CREATE TABLE release_ref ( - id BIGSERIAL PRIMARY KEY, release_rev UUID REFERENCES release_rev(id) NOT NULL, - target_release_ident_id UUID REFERENCES release_ident(id), -- or work? - index_val INTEGER, - key TEXT, - extra_json JSONB, -- title, year, container_title, locator (aka, page), oci_id - container_name TEXT, - year INTEGER, - title TEXT, - locator TEXT - -- TODO: oci_id (TEXT) -); - -CREATE INDEX release_ref_rev_idx ON release_ref(release_rev); + index_val INTEGER NOT NULL, + target_release_ident_id UUID REFERENCES release_ident(id) NOT NULL, + -- all other fields are interned in refs_blob as JSONB + -- key TEXT, + -- extra_json JSONB, -- title, year, container_title, locator (aka, page), oci_id + -- container_name TEXT, + -- year INTEGER, + -- title TEXT, + -- locator TEXT + PRIMARY KEY(release_rev, index_val) +); + CREATE INDEX release_ref_target_release_idx ON release_ref(target_release_ident_id); CREATE TABLE file_rev_release ( @@ -516,10 +528,10 @@ INSERT INTO abstracts (sha1, content) VALUES ('1ba86bf8c2979a62d29b18b537e50b2b093be27e', 'some long abstract in plain text'), ('0da908ab584b5e445a06beb172e3fab8cb5169e3', '<jats>A longer, more correct abstract should in theory go here</jats>'); -INSERT INTO container_rev (id, name, publisher, issnl, abbrev, coden, extra_json) VALUES - ('00000000-0000-0000-1111-FFF000000001', 'MySpace Blog', null, null, null, null, null), - ('00000000-0000-0000-1111-FFF000000002', 'Journal of Trivial Results', 'bogus publishing group', '1234-5678', 'Triv. 
Res.', 'CDNXYZ', '{"is_oa": false, "in_doaj": false}'), - ('00000000-0000-0000-1111-FFF000000003', 'PLOS Medicine', 'Public Library of Science', '1549-1277', 'PLoS med.', null, '{"is_oa": true, "in_doaj": true}'); +INSERT INTO container_rev (id, name, publisher, issnl, extra_json) VALUES + ('00000000-0000-0000-1111-FFF000000001', 'MySpace Blog', null, null, null), + ('00000000-0000-0000-1111-FFF000000002', 'Journal of Trivial Results', 'bogus publishing group', '1234-5678', '{"is_oa": false, "in_doaj": false}'), + ('00000000-0000-0000-1111-FFF000000003', 'PLOS Medicine', 'Public Library of Science', '1549-1277', '{"is_oa": true, "in_doaj": true}'); INSERT INTO container_ident (id, is_live, rev_id, redirect_id) VALUES ('00000000-0000-0000-1111-000000000001', true, '00000000-0000-0000-1111-FFF000000001', null), -- aaaaaaaaaaaaaeiraaaaaaaaae @@ -598,9 +610,9 @@ INSERT INTO webcapture_rev (id, original_url, timestamp) VALUES ('00000000-0000-0000-7777-FFF000000003', 'https://asheesh.org', '2003-02-17T04:47:21Z'); INSERT INTO webcapture_rev_cdx (webcapture_rev, surt, timestamp, url, mimetype, status_code, sha1, sha256) VALUES - ('00000000-0000-0000-7777-FFF000000002', 'org,example)/', 19960102123456, 'http://example.org', null, 200, '5886903ba5aeaf7446fe9f77bd03adfc029cedf0', null), - ('00000000-0000-0000-7777-FFF000000003', 'org,asheesh)/', 20030217044721, 'http://asheesh.org:80/', 'text/html', 200, '5886903ba5aeaf7446fe9f77bd03adfc029cedf0', 'ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362'), - ('00000000-0000-0000-7777-FFF000000003', 'org,asheesh)/robots.txt', 20030217044719, 'http://asheesh.org:80/robots.txt', 'text/html', 404, 'a637f1d27d9bcb237310ed29f19c07e1c8cf0aa5', 'ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362'); + ('00000000-0000-0000-7777-FFF000000002', 'org,example)/', '1996-01-02T12:34:56Z', 'http://example.org', null, 200, '5886903ba5aeaf7446fe9f77bd03adfc029cedf0', null), + ('00000000-0000-0000-7777-FFF000000003', 'org,asheesh)/', '2003-02-17T04:47:21Z', 'http://asheesh.org:80/', 'text/html', 200, '5886903ba5aeaf7446fe9f77bd03adfc029cedf0', 'ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362'), + ('00000000-0000-0000-7777-FFF000000003', 'org,asheesh)/robots.txt', '2003-02-17T04:47:19Z', 'http://asheesh.org:80/robots.txt', 'text/html', 404, 'a637f1d27d9bcb237310ed29f19c07e1c8cf0aa5', 'ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362'); INSERT INTO webcapture_rev_url (webcapture_rev, rel, url) VALUES ('00000000-0000-0000-7777-FFF000000002', 'wayback', 'http://web.archive.org/201801010001/http://example.org'), @@ -632,10 +644,14 @@ INSERT INTO work_edit (ident_id, rev_id, redirect_id, editgroup_id, prev_rev) VA ('00000000-0000-0000-5555-000000000002', '00000000-0000-0000-5555-FFF000000002', null, '00000000-0000-0000-BBBB-000000000004', null), ('00000000-0000-0000-5555-000000000002', '00000000-0000-0000-5555-FFF000000003', null, '00000000-0000-0000-BBBB-000000000005', '00000000-0000-0000-5555-FFF000000002'); -INSERT INTO release_rev (id, work_ident_id, container_ident_id, title, release_type, release_status, release_date, release_year, doi, wikidata_qid, pmid, pmcid, isbn13, core_id, volume, issue, pages, publisher, language) VALUES - ('00000000-0000-0000-4444-FFF000000001', '00000000-0000-0000-5555-000000000001', null, 'example title', null, null, null, null, null, null, null, null, null, null, null, null, null, null, null), - ('00000000-0000-0000-4444-FFF000000002', '00000000-0000-0000-5555-000000000002', 
'00000000-0000-0000-1111-000000000001', 'bigger example', 'article-journal', null, '2018-01-01', 2018, '10.123/abc', 'Q55555', '54321', 'PMC555','978-3-16-148410-0', '42022773', '12', 'IV', '5-9', 'bogus publishing group', 'cn'), - ('00000000-0000-0000-4444-FFF000000003', '00000000-0000-0000-5555-000000000003', '00000000-0000-0000-1111-000000000003', 'Why Most Published Research Findings Are False', 'article-journal', 'published', '2005-08-30', 2005, '10.1371/journal.pmed.0020124', null, null, null, null, null, '2', '8', 'e124', 'Public Library of Science', 'en'); +INSERT INTO refs_blob (sha1, refs_json) VALUES + ('22222222c2979a62d29b18b537e50b2b093be27e', '[{}, {}, {}, {}, {"extra": {"unstructured":"citation note"}}]'), + ('33333333c2979a62d29b18b537e50b2b093be27e', '[{"extra": {"unstructured": "Ioannidis JP, Haidich AB, Lau J. Any casualties in the clash of randomised and observational evidence? BMJ. 2001;322:879–880"}}, {"extra": {"unstructured":"Lawlor DA, Davey Smith G, Kundu D, Bruckdorfer KR, Ebrahim S. Those confounded vitamins: What can we learn from the differences between observational versus randomised trial evidence? Lancet. 2004;363:1724–1727."}}, {"extra": {"unstructured":"Vandenbroucke JP. When are observational studies as credible as randomised trials? Lancet. 2004;363:1728–1731."}}, {"extra": {"unstructured":"Michiels S, Koscielny S, Hill C. Prediction of cancer outcome with microarrays: A multiple random validation strategy. Lancet. 2005;365:488–492."}}, {"extra": {"unstructured":"Ioannidis JPA, Ntzani EE, Trikalinos TA, Contopoulos-Ioannidis DG. Replication validity of genetic association studies. Nat Genet. 2001;29:306–309."}}, {"extra": {"unstructured":"Colhoun HM, McKeigue PM, Davey Smith G. Problems of reporting genetic associations with complex outcomes. Lancet. 2003;361:865–872."}}]'); + +INSERT INTO release_rev (id, work_ident_id, container_ident_id, title, release_type, release_status, release_date, release_year, doi, wikidata_qid, pmid, pmcid, isbn13, core_id, volume, issue, pages, publisher, language, refs_blob_sha1) VALUES + ('00000000-0000-0000-4444-FFF000000001', '00000000-0000-0000-5555-000000000001', null, 'example title', null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null), + ('00000000-0000-0000-4444-FFF000000002', '00000000-0000-0000-5555-000000000002', '00000000-0000-0000-1111-000000000001', 'bigger example', 'article-journal', null, '2018-01-01', 2018, '10.123/abc', 'Q55555', '54321', 'PMC555','978-3-16-148410-0', '42022773', '12', 'IV', '5-9', 'bogus publishing group', 'cn', '22222222c2979a62d29b18b537e50b2b093be27e'), + ('00000000-0000-0000-4444-FFF000000003', '00000000-0000-0000-5555-000000000003', '00000000-0000-0000-1111-000000000003', 'Why Most Published Research Findings Are False', 'article-journal', 'published', '2005-08-30', 2005, '10.1371/journal.pmed.0020124', null, null, null, null, null, '2', '8', 'e124', 'Public Library of Science', 'en', '33333333c2979a62d29b18b537e50b2b093be27e'); INSERT INTO release_ident (id, is_live, rev_id, redirect_id) VALUES ('00000000-0000-0000-4444-000000000001', true, '00000000-0000-0000-4444-FFF000000001', null), -- aaaaaaaaaaaaarceaaaaaaaaae @@ -656,15 +672,14 @@ INSERT INTO release_contrib (release_rev, creator_ident_id, raw_name, role, inde ('00000000-0000-0000-4444-FFF000000002', '00000000-0000-0000-2222-000000000002', 'some contrib', 'editor', 4), ('00000000-0000-0000-4444-FFF000000003', '00000000-0000-0000-2222-000000000003', 'John P. A. 
Ioannidis', 'author', 0); -INSERT INTO release_ref (release_rev, target_release_ident_id, index_val, extra_json) VALUES - ('00000000-0000-0000-4444-FFF000000002', null, null, null), - ('00000000-0000-0000-4444-FFF000000002', '00000000-0000-0000-4444-000000000001', 4, '{"unstructured":"citation note"}'), - ('00000000-0000-0000-4444-FFF000000003', null, 0, '{"unstructured": "Ioannidis JP, Haidich AB, Lau J. Any casualties in the clash of randomised and observational evidence? BMJ. 2001;322:879–880"}'), - ('00000000-0000-0000-4444-FFF000000003', null, 1, '{"unstructured":"Lawlor DA, Davey Smith G, Kundu D, Bruckdorfer KR, Ebrahim S. Those confounded vitamins: What can we learn from the differences between observational versus randomised trial evidence? Lancet. 2004;363:1724–1727."}'), - ('00000000-0000-0000-4444-FFF000000003', null, 2, '{"unstructured":"Vandenbroucke JP. When are observational studies as credible as randomised trials? Lancet. 2004;363:1728–1731."}'), - ('00000000-0000-0000-4444-FFF000000003', null, 3, '{"unstructured":"Michiels S, Koscielny S, Hill C. Prediction of cancer outcome with microarrays: A multiple random validation strategy. Lancet. 2005;365:488–492."}'), - ('00000000-0000-0000-4444-FFF000000003', null, 4, '{"unstructured":"Ioannidis JPA, Ntzani EE, Trikalinos TA, Contopoulos-Ioannidis DG. Replication validity of genetic association studies. Nat Genet. 2001;29:306–309."}'), - ('00000000-0000-0000-4444-FFF000000003', null, 5, '{"unstructured":"Colhoun HM, McKeigue PM, Davey Smith G. Problems of reporting genetic associations with complex outcomes. Lancet. 2003;361:865–872."}'); +INSERT INTO release_ref (release_rev, index_val, target_release_ident_id) VALUES + ('00000000-0000-0000-4444-FFF000000002', 4, '00000000-0000-0000-4444-000000000001'), -- '{"unstructured":"citation note"}'), + ('00000000-0000-0000-4444-FFF000000003', 0, '00000000-0000-0000-4444-000000000001'), --'{"unstructured": "Ioannidis JP, Haidich AB, Lau J. Any casualties in the clash of randomised and observational evidence? BMJ. 2001;322:879–880"}'), + ('00000000-0000-0000-4444-FFF000000003', 1, '00000000-0000-0000-4444-000000000001'), --'{"unstructured":"Lawlor DA, Davey Smith G, Kundu D, Bruckdorfer KR, Ebrahim S. Those confounded vitamins: What can we learn from the differences between observational versus randomised trial evidence? Lancet. 2004;363:1724–1727."}'), + ('00000000-0000-0000-4444-FFF000000003', 2, '00000000-0000-0000-4444-000000000001'), --'{"unstructured":"Vandenbroucke JP. When are observational studies as credible as randomised trials? Lancet. 2004;363:1728–1731."}'), + ('00000000-0000-0000-4444-FFF000000003', 3, '00000000-0000-0000-4444-000000000001'), --'{"unstructured":"Michiels S, Koscielny S, Hill C. Prediction of cancer outcome with microarrays: A multiple random validation strategy. Lancet. 2005;365:488–492."}'), + ('00000000-0000-0000-4444-FFF000000003', 4, '00000000-0000-0000-4444-000000000001'), --'{"unstructured":"Ioannidis JPA, Ntzani EE, Trikalinos TA, Contopoulos-Ioannidis DG. Replication validity of genetic association studies. Nat Genet. 2001;29:306–309."}'), + ('00000000-0000-0000-4444-FFF000000003', 5, '00000000-0000-0000-4444-000000000001'); --'{"unstructured":"Colhoun HM, McKeigue PM, Davey Smith G. Problems of reporting genetic associations with complex outcomes. Lancet. 
2003;361:865–872."}'); INSERT INTO file_rev_release (file_rev, target_release_ident_id) VALUES ('00000000-0000-0000-3333-FFF000000002', '00000000-0000-0000-4444-000000000002'), diff --git a/rust/src/bin/fatcatd.rs b/rust/src/bin/fatcatd.rs index 75a6f000..ccce6725 100644 --- a/rust/src/bin/fatcatd.rs +++ b/rust/src/bin/fatcatd.rs @@ -88,7 +88,6 @@ fn main() -> Result<()> { server.metrics.incr("restart").unwrap(); } }; - info!(logger, "{:#?}", server.metrics); info!( logger, diff --git a/rust/src/database_models.rs b/rust/src/database_models.rs index 63fbcb29..adb38bda 100644 --- a/rust/src/database_models.rs +++ b/rust/src/database_models.rs @@ -3,8 +3,10 @@ use crate::database_schema::*; use crate::errors::*; use crate::identifiers::uuid2fcid; -use chrono; -use fatcat_api_spec::models::{ChangelogEntry, Editgroup, EditgroupAnnotation, Editor, EntityEdit}; +use chrono::Utc; +use fatcat_api_spec::models::{ + ChangelogEntry, Editgroup, EditgroupAnnotation, Editor, EntityEdit, ReleaseRef, +}; use serde_json; use uuid::Uuid; @@ -127,11 +129,10 @@ pub struct ContainerRevRow { pub id: Uuid, pub extra_json: Option<serde_json::Value>, pub name: String, + pub container_type: Option<String>, pub publisher: Option<String>, pub issnl: Option<String>, pub wikidata_qid: Option<String>, - pub abbrev: Option<String>, - pub coden: Option<String>, } #[derive(Debug, Associations, AsChangeset, Insertable)] @@ -139,11 +140,10 @@ pub struct ContainerRevRow { pub struct ContainerRevNewRow { pub extra_json: Option<serde_json::Value>, pub name: String, + pub container_type: Option<String>, pub publisher: Option<String>, pub issnl: Option<String>, pub wikidata_qid: Option<String>, - pub abbrev: Option<String>, - pub coden: Option<String>, } entity_structs!( @@ -305,7 +305,7 @@ pub struct WebcaptureRevCdxRow { pub id: i64, pub webcapture_rev: Uuid, pub surt: String, - pub timestamp: String, + pub timestamp: chrono::DateTime<Utc>, pub url: String, pub mimetype: Option<String>, pub status_code: Option<i64>, @@ -318,7 +318,7 @@ pub struct WebcaptureRevCdxRow { pub struct WebcaptureRevCdxNewRow { pub webcapture_rev: Uuid, pub surt: String, - pub timestamp: String, + pub timestamp: chrono::DateTime<Utc>, pub url: String, pub mimetype: Option<String>, pub status_code: Option<i64>, @@ -376,7 +376,9 @@ pub struct ReleaseRevRow { pub extra_json: Option<serde_json::Value>, pub work_ident_id: Uuid, pub container_ident_id: Option<Uuid>, + pub refs_blob_sha1: Option<String>, pub title: String, + pub original_title: Option<String>, pub release_type: Option<String>, pub release_status: Option<String>, pub release_date: Option<chrono::NaiveDate>, @@ -387,11 +389,14 @@ pub struct ReleaseRevRow { pub wikidata_qid: Option<String>, pub isbn13: Option<String>, pub core_id: Option<String>, + pub arxiv_id: Option<String>, + pub jstor_id: Option<String>, pub volume: Option<String>, pub issue: Option<String>, pub pages: Option<String>, pub publisher: Option<String>, pub language: Option<String>, + pub license_slug: Option<String>, } #[derive(Debug, Associations, AsChangeset, Insertable)] @@ -400,7 +405,9 @@ pub struct ReleaseRevNewRow { pub extra_json: Option<serde_json::Value>, pub work_ident_id: Uuid, pub container_ident_id: Option<Uuid>, + pub refs_blob_sha1: Option<String>, pub title: String, + pub original_title: Option<String>, pub release_type: Option<String>, pub release_status: Option<String>, pub release_date: Option<chrono::NaiveDate>, @@ -411,11 +418,14 @@ pub struct ReleaseRevNewRow { pub wikidata_qid: Option<String>, pub 
isbn13: Option<String>, pub core_id: Option<String>, + pub arxiv_id: Option<String>, + pub jstor_id: Option<String>, pub volume: Option<String>, pub issue: Option<String>, pub pages: Option<String>, pub publisher: Option<String>, pub language: Option<String>, + pub license_slug: Option<String>, } entity_structs!( @@ -476,6 +486,7 @@ pub struct ReleaseContribRow { pub creator_ident_id: Option<Uuid>, pub raw_name: Option<String>, pub role: Option<String>, + pub raw_affiliation: Option<String>, pub index_val: Option<i32>, pub extra_json: Option<serde_json::Value>, } @@ -487,39 +498,107 @@ pub struct ReleaseContribNewRow { pub creator_ident_id: Option<Uuid>, pub raw_name: Option<String>, pub role: Option<String>, + pub raw_affiliation: Option<String>, pub index_val: Option<i32>, pub extra_json: Option<serde_json::Value>, } -#[derive(Debug, Queryable, Identifiable, Associations, AsChangeset)] +#[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] #[table_name = "release_ref"] pub struct ReleaseRefRow { - pub id: i64, pub release_rev: Uuid, - pub target_release_ident_id: Option<Uuid>, - pub index_val: Option<i32>, - pub key: Option<String>, - pub extra_json: Option<serde_json::Value>, - pub container_name: Option<String>, - pub year: Option<i32>, - pub title: Option<String>, - pub locator: Option<String>, + pub index_val: i32, + pub target_release_ident_id: Uuid, } -#[derive(Debug, Insertable, AsChangeset)] -#[table_name = "release_ref"] -pub struct ReleaseRefNewRow { - pub release_rev: Uuid, - pub target_release_ident_id: Option<Uuid>, - pub index_val: Option<i32>, +#[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] +#[table_name = "refs_blob"] +pub struct RefsBlobRow { + pub sha1: String, + pub refs_json: serde_json::Value, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +/// This model is a stable representation of what goes in a RefsBlobRow `refs_json` field (an array +/// of this model). We could rely on the `ReleaseRef` API spec model directly, but that would lock +/// the database contents to the API spec rigidly; by defining this struct independently, we can +/// migrate the schemas. To start, this is a direct copy of the `ReleaseRef` model. 
+pub struct RefsBlobJson { + #[serde(rename = "index")] + #[serde(skip_serializing_if = "Option::is_none")] + pub index: Option<i64>, + + /// base32-encoded unique identifier + #[serde(rename = "target_release_id")] + #[serde(skip_serializing_if = "Option::is_none")] + pub target_release_id: Option<String>, + + #[serde(rename = "extra")] + #[serde(skip_serializing_if = "Option::is_none")] + pub extra: Option<serde_json::Value>, + + #[serde(rename = "key")] + #[serde(skip_serializing_if = "Option::is_none")] pub key: Option<String>, - pub extra_json: Option<serde_json::Value>, + + #[serde(rename = "year")] + #[serde(skip_serializing_if = "Option::is_none")] + pub year: Option<i64>, + + #[serde(rename = "container_name")] + #[serde(skip_serializing_if = "Option::is_none")] pub container_name: Option<String>, - pub year: Option<i32>, + + #[serde(rename = "title")] + #[serde(skip_serializing_if = "Option::is_none")] pub title: Option<String>, + + #[serde(rename = "locator")] + #[serde(skip_serializing_if = "Option::is_none")] pub locator: Option<String>, } +impl RefsBlobJson { + pub fn into_model(self) -> ReleaseRef { + ReleaseRef { + index: self.index, + target_release_id: self.target_release_id, + extra: self.extra, + key: self.key, + year: self.year, + container_name: self.container_name, + title: self.title, + locator: self.locator, + } + } + + pub fn to_model(&self) -> ReleaseRef { + ReleaseRef { + index: self.index, + target_release_id: self.target_release_id.clone(), + extra: self.extra.clone(), + key: self.key.clone(), + year: self.year, + container_name: self.container_name.clone(), + title: self.title.clone(), + locator: self.locator.clone(), + } + } + + pub fn from_model(model: &ReleaseRef) -> RefsBlobJson { + RefsBlobJson { + index: model.index, + target_release_id: model.target_release_id.clone(), + extra: model.extra.clone(), + key: model.key.clone(), + year: model.year, + container_name: model.container_name.clone(), + title: model.title.clone(), + locator: model.locator.clone(), + } + } +} + #[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] #[table_name = "file_rev_release"] pub struct FileRevReleaseRow { diff --git a/rust/src/database_schema.rs b/rust/src/database_schema.rs index 3bc57d95..ea184226 100644 --- a/rust/src/database_schema.rs +++ b/rust/src/database_schema.rs @@ -51,11 +51,10 @@ table! { id -> Uuid, extra_json -> Nullable<Jsonb>, name -> Text, + container_type -> Nullable<Text>, publisher -> Nullable<Text>, issnl -> Nullable<Text>, wikidata_qid -> Nullable<Text>, - abbrev -> Nullable<Text>, - coden -> Nullable<Text>, } } @@ -239,12 +238,20 @@ table! { } table! { + refs_blob (sha1) { + sha1 -> Text, + refs_json -> Jsonb, + } +} + +table! { release_contrib (id) { id -> Int8, release_rev -> Uuid, creator_ident_id -> Nullable<Uuid>, raw_name -> Nullable<Text>, role -> Nullable<Text>, + raw_affiliation -> Nullable<Text>, index_val -> Nullable<Int4>, extra_json -> Nullable<Jsonb>, } @@ -273,17 +280,10 @@ table! { } table! { - release_ref (id) { - id -> Int8, + release_ref (release_rev, index_val) { release_rev -> Uuid, - target_release_ident_id -> Nullable<Uuid>, - index_val -> Nullable<Int4>, - key -> Nullable<Text>, - extra_json -> Nullable<Jsonb>, - container_name -> Nullable<Text>, - year -> Nullable<Int4>, - title -> Nullable<Text>, - locator -> Nullable<Text>, + index_val -> Int4, + target_release_ident_id -> Uuid, } } @@ -293,7 +293,9 @@ table! 
{ extra_json -> Nullable<Jsonb>, work_ident_id -> Uuid, container_ident_id -> Nullable<Uuid>, + refs_blob_sha1 -> Nullable<Text>, title -> Text, + original_title -> Nullable<Text>, release_type -> Nullable<Text>, release_status -> Nullable<Text>, release_date -> Nullable<Date>, @@ -304,11 +306,14 @@ table! { wikidata_qid -> Nullable<Text>, isbn13 -> Nullable<Text>, core_id -> Nullable<Text>, + arxiv_id -> Nullable<Text>, + jstor_id -> Nullable<Text>, volume -> Nullable<Text>, issue -> Nullable<Text>, pages -> Nullable<Text>, publisher -> Nullable<Text>, language -> Nullable<Text>, + license_slug -> Nullable<Text>, } } @@ -358,7 +363,7 @@ table! { id -> Int8, webcapture_rev -> Uuid, surt -> Text, - timestamp -> Text, + timestamp -> Timestamptz, url -> Text, mimetype -> Nullable<Text>, status_code -> Nullable<Int8>, @@ -439,6 +444,7 @@ joinable!(release_ident -> release_rev (rev_id)); joinable!(release_ref -> release_ident (target_release_ident_id)); joinable!(release_ref -> release_rev (release_rev)); joinable!(release_rev -> container_ident (container_ident_id)); +joinable!(release_rev -> refs_blob (refs_blob_sha1)); joinable!(release_rev -> work_ident (work_ident_id)); joinable!(release_rev_abstract -> abstracts (abstract_sha1)); joinable!(release_rev_abstract -> release_rev (release_rev)); @@ -475,6 +481,7 @@ allow_tables_to_appear_in_same_query!( fileset_rev_file, fileset_rev_release, fileset_rev_url, + refs_blob, release_contrib, release_edit, release_ident, diff --git a/rust/src/editing.rs b/rust/src/editing.rs index e181e8a7..c17e5964 100644 --- a/rust/src/editing.rs +++ b/rust/src/editing.rs @@ -42,6 +42,8 @@ pub fn make_edit_context( editor_id: FatcatId, editgroup_id: Option<FatcatId>, autoaccept: bool, + description: Option<String>, + extra: Option<serde_json::Value>, ) -> Result<EditContext> { // *either* autoaccept is false and editgroup_id is Some, *or* autoaccept is true and // editgroup_id is None @@ -54,8 +56,8 @@ pub fn make_edit_context( editor: None, changelog_index: None, submitted: None, - description: None, - extra: None, + description: description, + extra: extra, annotations: None, edits: None, }; diff --git a/rust/src/endpoint_handlers.rs b/rust/src/endpoint_handlers.rs index bc606af9..d9bd3403 100644 --- a/rust/src/endpoint_handlers.rs +++ b/rust/src/endpoint_handlers.rs @@ -26,9 +26,11 @@ macro_rules! 
entity_batch_handler { autoaccept: bool, editor_id: FatcatId, editgroup_id: Option<FatcatId>, + description: Option<String>, + extra: Option<serde_json::Value>, ) -> Result<Vec<EntityEdit>> { - let edit_context = make_edit_context(conn, editor_id, editgroup_id, autoaccept)?; + let edit_context = make_edit_context(conn, editor_id, editgroup_id, autoaccept, description, extra)?; edit_context.check(&conn)?; let model_list: Vec<&models::$model> = entity_list.iter().map(|e| e).collect(); let edits = $model::db_create_batch(conn, &edit_context, model_list.as_slice())?; @@ -259,71 +261,99 @@ impl Server { pmid: &Option<String>, pmcid: &Option<String>, core_id: &Option<String>, + arxiv_id: &Option<String>, + jstor_id: &Option<String>, expand_flags: ExpandFlags, hide_flags: HideFlags, ) -> Result<ReleaseEntity> { - let (ident, rev): (ReleaseIdentRow, ReleaseRevRow) = - match (doi, wikidata_qid, isbn13, pmid, pmcid, core_id) { - (Some(doi), None, None, None, None, None) => { - check_doi(doi)?; - release_ident::table - .inner_join(release_rev::table) - .filter(release_rev::doi.eq(doi)) - .filter(release_ident::is_live.eq(true)) - .filter(release_ident::redirect_id.is_null()) - .first(conn)? - } - (None, Some(wikidata_qid), None, None, None, None) => { - check_wikidata_qid(wikidata_qid)?; - release_ident::table - .inner_join(release_rev::table) - .filter(release_rev::wikidata_qid.eq(wikidata_qid)) - .filter(release_ident::is_live.eq(true)) - .filter(release_ident::redirect_id.is_null()) - .first(conn)? - } - (None, None, Some(isbn13), None, None, None) => { - // TODO: check_isbn13(isbn13)?; - release_ident::table - .inner_join(release_rev::table) - .filter(release_rev::isbn13.eq(isbn13)) - .filter(release_ident::is_live.eq(true)) - .filter(release_ident::redirect_id.is_null()) - .first(conn)? - } - (None, None, None, Some(pmid), None, None) => { - check_pmid(pmid)?; - release_ident::table - .inner_join(release_rev::table) - .filter(release_rev::pmid.eq(pmid)) - .filter(release_ident::is_live.eq(true)) - .filter(release_ident::redirect_id.is_null()) - .first(conn)? - } - (None, None, None, None, Some(pmcid), None) => { - check_pmcid(pmcid)?; - release_ident::table - .inner_join(release_rev::table) - .filter(release_rev::pmcid.eq(pmcid)) - .filter(release_ident::is_live.eq(true)) - .filter(release_ident::redirect_id.is_null()) - .first(conn)? - } - (None, None, None, None, None, Some(core_id)) => { - // TODO: check_core_id(core_id)?; - release_ident::table - .inner_join(release_rev::table) - .filter(release_rev::core_id.eq(core_id)) - .filter(release_ident::is_live.eq(true)) - .filter(release_ident::redirect_id.is_null()) - .first(conn)? - } - _ => { - return Err( - FatcatError::MissingOrMultipleExternalId("in lookup".to_string()).into(), - ); - } - }; + let (ident, rev): (ReleaseIdentRow, ReleaseRevRow) = match ( + doi, + wikidata_qid, + isbn13, + pmid, + pmcid, + core_id, + arxiv_id, + jstor_id, + ) { + (Some(doi), None, None, None, None, None, None, None) => { + check_doi(doi)?; + release_ident::table + .inner_join(release_rev::table) + .filter(release_rev::doi.eq(doi)) + .filter(release_ident::is_live.eq(true)) + .filter(release_ident::redirect_id.is_null()) + .first(conn)? + } + (None, Some(wikidata_qid), None, None, None, None, None, None) => { + check_wikidata_qid(wikidata_qid)?; + release_ident::table + .inner_join(release_rev::table) + .filter(release_rev::wikidata_qid.eq(wikidata_qid)) + .filter(release_ident::is_live.eq(true)) + .filter(release_ident::redirect_id.is_null()) + .first(conn)? 
+ } + (None, None, Some(isbn13), None, None, None, None, None) => { + // TODO: check_isbn13(isbn13)?; + release_ident::table + .inner_join(release_rev::table) + .filter(release_rev::isbn13.eq(isbn13)) + .filter(release_ident::is_live.eq(true)) + .filter(release_ident::redirect_id.is_null()) + .first(conn)? + } + (None, None, None, Some(pmid), None, None, None, None) => { + check_pmid(pmid)?; + release_ident::table + .inner_join(release_rev::table) + .filter(release_rev::pmid.eq(pmid)) + .filter(release_ident::is_live.eq(true)) + .filter(release_ident::redirect_id.is_null()) + .first(conn)? + } + (None, None, None, None, Some(pmcid), None, None, None) => { + check_pmcid(pmcid)?; + release_ident::table + .inner_join(release_rev::table) + .filter(release_rev::pmcid.eq(pmcid)) + .filter(release_ident::is_live.eq(true)) + .filter(release_ident::redirect_id.is_null()) + .first(conn)? + } + (None, None, None, None, None, Some(core_id), None, None) => { + // TODO: check_core_id(core_id)?; + release_ident::table + .inner_join(release_rev::table) + .filter(release_rev::core_id.eq(core_id)) + .filter(release_ident::is_live.eq(true)) + .filter(release_ident::redirect_id.is_null()) + .first(conn)? + } + (None, None, None, None, None, None, Some(arxiv_id), None) => { + // TODO: check_arxiv_id(arxiv_id)?; + release_ident::table + .inner_join(release_rev::table) + .filter(release_rev::arxiv_id.eq(arxiv_id)) + .filter(release_ident::is_live.eq(true)) + .filter(release_ident::redirect_id.is_null()) + .first(conn)? + } + (None, None, None, None, None, None, None, Some(jstor_id)) => { + // TODO: check_jstor_id(jstor_id)?; + release_ident::table + .inner_join(release_rev::table) + .filter(release_rev::jstor_id.eq(jstor_id)) + .filter(release_ident::is_live.eq(true)) + .filter(release_ident::redirect_id.is_null()) + .first(conn)? + } + _ => { + return Err( + FatcatError::MissingOrMultipleExternalId("in lookup".to_string()).into(), + ); + } + }; let mut entity = ReleaseEntity::db_from_row(conn, rev, Some(ident), hide_flags)?; entity.db_expand(&conn, expand_flags)?; diff --git a/rust/src/endpoints.rs b/rust/src/endpoints.rs index f7e93448..2e467957 100644 --- a/rust/src/endpoints.rs +++ b/rust/src/endpoints.rs @@ -120,7 +120,7 @@ macro_rules! wrap_entity_handlers { let auth_context = self.auth_confectionary.require_auth(&conn, &context.auth_data, Some(stringify!($post_fn)))?; auth_context.require_role(FatcatRole::Editor)?; auth_context.require_editgroup(&conn, editgroup_id)?; - let edit_context = make_edit_context(&conn, auth_context.editor_id, Some(editgroup_id), false)?; + let edit_context = make_edit_context(&conn, auth_context.editor_id, Some(editgroup_id), false, None, None)?; edit_context.check(&conn)?; entity.db_create(&conn, &edit_context)?.into_model() }).map_err(|e| FatcatError::from(e)) { @@ -138,18 +138,30 @@ macro_rules! 
         entity_list: &Vec<models::$model>,
         autoaccept: Option<bool>,
         editgroup_id: Option<String>,
+        description: Option<String>,
+        extra: Option<String>,
         context: &Context,
     ) -> Box<Future<Item = $post_batch_resp, Error = ApiError> + Send> {
         let conn = self.db_pool.get().expect("db_pool error");
         let ret = match conn.transaction(|| {
             let auth_context = self.auth_confectionary.require_auth(&conn, &context.auth_data, Some(stringify!($post_batch_fn)))?;
-            auth_context.require_role(FatcatRole::Editor)?;
+            let autoaccept = autoaccept.unwrap_or(false);
+            if autoaccept {
+                auth_context.require_role(FatcatRole::Admin)?;
+            } else {
+                auth_context.require_role(FatcatRole::Editor)?;
+            };
             let editgroup_id = if let Some(s) = editgroup_id {
+                // make_edit_context() checks for the "both editgroup_id and autoaccept" error case
                 let eg_id = FatcatId::from_str(&s)?;
                 auth_context.require_editgroup(&conn, eg_id)?;
                 Some(eg_id)
             } else { None };
-            self.$post_batch_handler(&conn, entity_list, autoaccept.unwrap_or(false), auth_context.editor_id, editgroup_id)
+            let extra: Option<serde_json::Value> = match extra {
+                Some(v) => serde_json::from_str(&v)?,
+                None => None,
+            };
+            self.$post_batch_handler(&conn, entity_list, autoaccept, auth_context.editor_id, editgroup_id, description, extra)
         }).map_err(|e| FatcatError::from(e)) {
             Ok(edits) => {
                 self.metrics.count("entities.created", edits.len() as i64).ok();
@@ -178,7 +190,7 @@ wrap_entity_handlers {
             auth_context.require_role(FatcatRole::Editor)?;
             let entity_id = FatcatId::from_str(&ident)?;
             auth_context.require_editgroup(&conn, editgroup_id)?;
-            let edit_context = make_edit_context(&conn, auth_context.editor_id, Some(editgroup_id), false)?;
+            let edit_context = make_edit_context(&conn, auth_context.editor_id, Some(editgroup_id), false, None, None)?;
             edit_context.check(&conn)?;
             entity.db_update(&conn, &edit_context, entity_id)?.into_model()
         }).map_err(|e| FatcatError::from(e)) {
@@ -204,7 +216,7 @@ macro_rules! wrap_entity_handlers {
             auth_context.require_role(FatcatRole::Editor)?;
             let entity_id = FatcatId::from_str(&ident)?;
             auth_context.require_editgroup(&conn, editgroup_id)?;
-            let edit_context = make_edit_context(&conn, auth_context.editor_id, Some(editgroup_id), false)?;
+            let edit_context = make_edit_context(&conn, auth_context.editor_id, Some(editgroup_id), false, None, None)?;
             edit_context.check(&conn)?;
             $model::db_delete(&conn, &edit_context, entity_id)?.into_model()
         }).map_err(|e| FatcatError::from(e)) {
@@ -659,6 +671,8 @@ impl Api for Server {
         pmid: Option<String>,
         pmcid: Option<String>,
         core_id: Option<String>,
+        arxiv_id: Option<String>,
+        jstor_id: Option<String>,
         expand: Option<String>,
         hide: Option<String>,
         _context: &Context,
@@ -682,6 +696,8 @@ impl Api for Server {
             &pmid,
             &pmcid,
             &core_id,
+            &arxiv_id,
+            &jstor_id,
             expand_flags,
             hide_flags,
         )
diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs
index ce1c1ed7..a92c45a6 100644
--- a/rust/src/entity_crud.rs
+++ b/rust/src/entity_crud.rs
@@ -8,7 +8,7 @@
 use crate::database_models::*;
 use crate::database_schema::*;
 use crate::editing::EditContext;
-use crate::endpoint_handlers::get_release_files;
+use crate::endpoint_handlers::{get_release_files, get_release_filesets, get_release_webcaptures};
 use crate::errors::*;
 use crate::identifiers::*;
 use crate::server::DbConn;
@@ -798,8 +798,7 @@ impl EntityCrud for ContainerEntity {
             wikidata_qid: None,
             publisher: None,
             name: None,
-            abbrev: None,
-            coden: None,
+            container_type: None,
             state: Some(ident_row.state().unwrap().shortname()),
             ident: Some(FatcatId::from_uuid(&ident_row.id).to_string()),
             revision: ident_row.rev_id.map(|u| u.to_string()),
@@ -831,8 +830,7 @@ impl EntityCrud for ContainerEntity {
             wikidata_qid: rev_row.wikidata_qid,
             publisher: rev_row.publisher,
             name: Some(rev_row.name),
-            abbrev: rev_row.abbrev,
-            coden: rev_row.coden,
+            container_type: rev_row.container_type,
             state,
             ident: ident_id,
             revision: Some(rev_row.id.to_string()),
@@ -869,8 +867,7 @@ impl EntityCrud for ContainerEntity {
                     publisher: model.publisher.clone(),
                     issnl: model.issnl.clone(),
                     wikidata_qid: model.wikidata_qid.clone(),
-                    abbrev: model.abbrev.clone(),
-                    coden: model.coden.clone(),
+                    container_type: model.container_type.clone(),
                     extra_json: model.extra.clone(),
                 })
                 .collect::<Vec<ContainerRevNewRow>>(),
@@ -1619,6 +1616,7 @@ impl EntityCrud for ReleaseEntity {
         Ok(ReleaseEntity {
             title: None,
+            original_title: None,
             release_type: None,
             release_status: None,
             release_date: None,
@@ -1627,8 +1625,10 @@
             pmid: None,
             pmcid: None,
             isbn13: None,
-            core_id: None,
             wikidata_qid: None,
+            core_id: None,
+            arxiv_id: None,
+            jstor_id: None,
             volume: None,
             issue: None,
             pages: None,
@@ -1639,6 +1639,7 @@
             container_id: None,
             publisher: None,
             language: None,
+            license_slug: None,
             work_id: None,
             refs: None,
             contribs: None,
@@ -1675,6 +1676,26 @@
             };
             self.files = Some(get_release_files(conn, ident, HideFlags::none())?);
         }
+        if expand.filesets && self.ident.is_some() {
+            let ident = match &self.ident {
+                None => bail!("Can't expand filesets on a non-concrete entity"), // redundant with above is_some()
+                Some(ident) => match &self.redirect {
+                    None => FatcatId::from_str(&ident)?,
+                    Some(redir) => FatcatId::from_str(&redir)?,
+                },
+            };
+            self.filesets = Some(get_release_filesets(conn, ident, HideFlags::none())?);
+        }
+        if expand.webcaptures && self.ident.is_some() {
+            let ident = match &self.ident {
+                None => bail!("Can't expand webcaptures on a non-concrete entity"), // redundant with above is_some()
+                Some(ident) => match &self.redirect {
+                    None => FatcatId::from_str(&ident)?,
+                    Some(redir) => FatcatId::from_str(&redir)?,
+                },
+            };
+            self.webcaptures = Some(get_release_webcaptures(conn, ident, HideFlags::none())?);
+        }
         if expand.container {
             if let Some(ref cid) = self.container_id {
                 self.container = Some(ContainerEntity::db_get(
@@ -1812,28 +1833,28 @@ impl EntityCrud for ReleaseEntity {
             None => (None, None, None),
         };
 
-        let refs: Option<Vec<ReleaseRef>> = match hide.refs {
-            true => None,
-            false => Some(
-                release_ref::table
+        let refs: Option<Vec<ReleaseRef>> = match (hide.refs, rev_row.refs_blob_sha1) {
+            (true, _) => None,
+            (false, None) => Some(vec![]),
+            (false, Some(sha1)) => Some({
+                let refs_blob: RefsBlobRow = refs_blob::table
+                    .find(sha1) // checked in match
+                    .get_result(conn)?;
+                let refs: Vec<RefsBlobJson> = serde_json::from_value(refs_blob.refs_json)?;
+                let mut refs: Vec<ReleaseRef> = refs.into_iter().map(|j| j.into_model()).collect();
+                let ref_rows: Vec<ReleaseRefRow> = release_ref::table
                     .filter(release_ref::release_rev.eq(rev_row.id))
                     .order(release_ref::index_val.asc())
-                    .get_results(conn)?
-                    .into_iter()
-                    .map(|r: ReleaseRefRow| ReleaseRef {
-                        index: r.index_val.map(|v| v as i64),
-                        key: r.key,
-                        extra: r.extra_json,
-                        container_name: r.container_name,
-                        year: r.year.map(|v| v as i64),
-                        title: r.title,
-                        locator: r.locator,
-                        target_release_id: r
-                            .target_release_ident_id
-                            .map(|v| FatcatId::from_uuid(&v).to_string()),
-                    })
-                    .collect(),
-            ),
+                    .get_results(conn)?;
+                for index in 0..refs.len() {
+                    refs[index].index = Some(index as i64)
+                }
+                for row in ref_rows {
+                    refs[row.index_val as usize].target_release_id =
+                        Some(FatcatId::from_uuid(&row.target_release_ident_id).to_string());
+                }
+                refs
+            }),
         };
 
         let contribs: Option<Vec<ReleaseContrib>> = match hide.contribs {
@@ -1851,6 +1872,7 @@
                         index: c.index_val.map(|v| v as i64),
                         raw_name: c.raw_name,
                         role: c.role,
+                        raw_affiliation: c.raw_affiliation,
                         extra: c.extra_json,
                         creator_id: c
                             .creator_ident_id
@@ -1884,6 +1906,7 @@
         Ok(ReleaseEntity {
             title: Some(rev_row.title),
+            original_title: rev_row.original_title,
             release_type: rev_row.release_type,
             release_status: rev_row.release_status,
             release_date: rev_row.release_date,
@@ -1892,8 +1915,10 @@
             pmid: rev_row.pmid,
             pmcid: rev_row.pmcid,
             isbn13: rev_row.isbn13,
-            core_id: rev_row.core_id,
             wikidata_qid: rev_row.wikidata_qid,
+            core_id: rev_row.core_id,
+            arxiv_id: rev_row.arxiv_id,
+            jstor_id: rev_row.jstor_id,
             volume: rev_row.volume,
             issue: rev_row.issue,
             pages: rev_row.pages,
@@ -1906,6 +1931,7 @@
                 .map(|u| FatcatId::from_uuid(&u).to_string()),
             publisher: rev_row.publisher,
             language: rev_row.language,
+            license_slug: rev_row.license_slug,
             work_id: Some(FatcatId::from_uuid(&rev_row.work_ident_id).to_string()),
             refs,
             contribs,
@@ -1934,6 +1960,7 @@
         if let Some(ref extid) = entity.wikidata_qid {
             check_wikidata_qid(extid)?;
         }
+        // TODO: JSTOR and arxiv IDs
         if let Some(ref release_type) = entity.release_type {
             check_release_type(release_type)?;
         }
@@ -1953,13 +1980,65 @@
             .into());
         }
 
+        // First, calculate and upsert any refs JSON blobs and record the SHA1 keys, so they can be
+        // included in the release_rev row itself
+        let mut refs_blob_rows: Vec<RefsBlobRow> = vec![];
+        let mut refs_blob_sha1: Vec<Option<String>> = vec![];
+        for model in models.iter() {
+            match &model.refs {
+                None => {
+                    refs_blob_sha1.push(None);
+                }
+                Some(ref_list) => {
+                    if ref_list.is_empty() {
+                        refs_blob_sha1.push(None);
+                        continue;
+                    }
+                    // Have to strip out target refs and indexes, or hashing won't work well when
+                    // these change
+                    let ref_list: Vec<RefsBlobJson> = ref_list
+                        .iter()
+                        .map(|r: &ReleaseRef| {
+                            let mut r = RefsBlobJson::from_model(r);
+                            r.target_release_id = None;
+                            r.index = None;
+                            r
+                        })
+                        .collect();
+                    // TODO: maybe `canonical_json` crate?
+                    let refs_json = serde_json::to_value(ref_list)?;
+                    let refs_str = refs_json.to_string();
+                    let sha1 = Sha1::from(refs_str).hexdigest();
+                    let blob = RefsBlobRow {
+                        sha1: sha1.clone(),
+                        refs_json,
+                    };
+                    refs_blob_rows.push(blob);
+                    refs_blob_sha1.push(Some(sha1));
+                }
+            };
+        }
+
+        if !refs_blob_rows.is_empty() {
+            // Sort of an "upsert"; only inserts new refs blob rows if they don't already exist
+            insert_into(refs_blob::table)
+                .values(&refs_blob_rows)
+                .on_conflict(refs_blob::sha1)
+                .do_nothing()
+                .execute(conn)?;
+        }
+
+        // Then the main release_revs themselves
         let rev_ids: Vec<Uuid> = insert_into(release_rev::table)
             .values(
                 models
                     .iter()
-                    .map(|model| {
+                    .zip(refs_blob_sha1.into_iter())
+                    .map(|(model, refs_sha1)| {
                         Ok(ReleaseRevNewRow {
+                            refs_blob_sha1: refs_sha1,
                             title: model.title.clone().unwrap(), // titles checked above
+                            original_title: model.original_title.clone(),
                             release_type: model.release_type.clone(),
                             release_status: model.release_status.clone(),
                             release_date: model.release_date,
@@ -1970,6 +2049,8 @@
                             wikidata_qid: model.wikidata_qid.clone(),
                             isbn13: model.isbn13.clone(),
                             core_id: model.core_id.clone(),
+                            arxiv_id: model.arxiv_id.clone(),
+                            jstor_id: model.jstor_id.clone(),
                             volume: model.volume.clone(),
                             issue: model.issue.clone(),
                             pages: model.pages.clone(),
@@ -1983,6 +2064,7 @@
                             },
                             publisher: model.publisher.clone(),
                             language: model.language.clone(),
+                            license_slug: model.license_slug.clone(),
                             extra_json: model.extra.clone()
                         })
                     })
@@ -1991,34 +2073,32 @@
             .returning(release_rev::id)
             .get_results(conn)?;
 
-        let mut release_ref_rows: Vec<ReleaseRefNewRow> = vec![];
+        let mut release_ref_rows: Vec<ReleaseRefRow> = vec![];
         let mut release_contrib_rows: Vec<ReleaseContribNewRow> = vec![];
         let mut abstract_rows: Vec<AbstractsRow> = vec![];
         let mut release_abstract_rows: Vec<ReleaseRevAbstractNewRow> = vec![];
 
         for (model, rev_id) in models.iter().zip(rev_ids.iter()) {
+            // We didn't know the release_rev id to insert here, so need to re-iterate over refs
             match &model.refs {
                 None => (),
                 Some(ref_list) => {
-                    let these_ref_rows: Vec<ReleaseRefNewRow> = ref_list
+                    let these_ref_rows: Vec<ReleaseRefRow> = ref_list
                         .iter()
-                        .map(|r| {
-                            Ok(ReleaseRefNewRow {
+                        .enumerate()
+                        .filter(|(_, r)| r.target_release_id.is_some())
+                        .map(|(index, r)| {
+                            Ok(ReleaseRefRow {
                                 release_rev: *rev_id,
-                                target_release_ident_id: match r.target_release_id.clone() {
-                                    None => None,
-                                    Some(v) => Some(FatcatId::from_str(&v)?.to_uuid()),
-                                },
-                                index_val: r.index.map(|v| v as i32),
-                                key: r.key.clone(),
-                                container_name: r.container_name.clone(),
-                                year: r.year.map(|v| v as i32),
-                                title: r.title.clone(),
-                                locator: r.locator.clone(),
-                                extra_json: r.extra.clone(),
+                                // unwrap() checked by is_some() filter
+                                target_release_ident_id: FatcatId::from_str(
+                                    &r.target_release_id.clone().unwrap(),
+                                )?
+                                .to_uuid(),
+                                index_val: index as i32,
                             })
                         })
-                        .collect::<Result<Vec<ReleaseRefNewRow>>>()?;
+                        .collect::<Result<Vec<ReleaseRefRow>>>()?;
                     release_ref_rows.extend(these_ref_rows);
                 }
             };
@@ -2038,6 +2118,7 @@
                                 raw_name: c.raw_name.clone(),
                                 index_val: c.index.map(|v| v as i32),
                                 role: c.role.clone(),
+                                raw_affiliation: c.raw_affiliation.clone(),
                                 extra_json: c.extra.clone(),
                             })
                         })
@@ -2053,7 +2134,7 @@
                 .iter()
                 .filter(|ea| ea.content.is_some())
                 .map(|c| AbstractsRow {
-                    sha1: Sha1::from(c.content.clone().unwrap()).hexdigest(),
+                    sha1: Sha1::from(c.content.as_ref().unwrap()).hexdigest(),
                     content: c.content.clone().unwrap(),
                 })
                 .collect();
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index b7661334..d089adf8 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -9,6 +9,8 @@
 extern crate log;
 extern crate lazy_static;
 #[macro_use]
 extern crate failure;
+#[macro_use]
+extern crate serde_derive;
 pub mod auth;
 pub mod database_models;
diff --git a/rust/tests/test_api_server_http.rs b/rust/tests/test_api_server_http.rs
index 0ec2650a..66f36a14 100644
--- a/rust/tests/test_api_server_http.rs
+++ b/rust/tests/test_api_server_http.rs
@@ -694,7 +694,7 @@ fn test_post_webcapture() {
                 "timestamp": "2018-12-28T05:06:07Z",
                 "cdx": [
                     {"surt": "org,asheesh,)/robots.txt",
-                     "timestamp": "20181228050607",
+                     "timestamp": "2018-12-28T05:06:07Z",
                      "url": "https://asheesh.org/robots.txt",
                      "status_code": 200,
                      "mimetype": "text/html",
@@ -1409,7 +1409,7 @@ fn test_post_batch_autoaccept() {
         None,
     );
 
-    // "n"
+    // "n" (TODO)
    let editgroup_id = helpers::quick_editgroup(&conn);
     helpers::check_http_response(
         request::post(
@@ -1421,7 +1421,7 @@
             r#"[{"name": "test journal"}, {"name": "another test journal"}]"#,
             &router,
         ),
-        status::Created,
+        status::BadRequest, // TODO
         None,
     );
 
@@ -1813,3 +1813,86 @@ fn test_editgroup_annotations() {
         Some("special test annotation"),
     );
 }
+
+#[test]
+fn test_query_params() {
+    let (headers, router, _conn) = helpers::setup_http();
+
+    helpers::check_http_response(
+        request::get(
+            "http://localhost:9411/v0/changelog?limit=true",
+            headers.clone(),
+            &router,
+        ),
+        status::BadRequest,
+        Some("integer"),
+    );
+
+    helpers::check_http_response(
+        request::get(
+            &format!("http://localhost:9411/v0/editgroup/reviewable?since=asdf"),
+            headers.clone(),
+            &router,
+        ),
+        status::BadRequest,
+        Some("datetime"),
+    );
+
+    helpers::check_http_response(
+        request::get(
+            &format!("http://localhost:9411/v0/editgroup/reviewable?since=1999-06-05T12:34:00Z"),
+            headers.clone(),
+            &router,
+        ),
+        status::Ok,
+        None,
+    );
+
+    // Python3: datetime.datetime.utcnow().isoformat() + "Z"
+    helpers::check_http_response(
+        request::get(
+            &format!(
+                "http://localhost:9411/v0/editgroup/reviewable?since=2019-01-17T23:32:03.269010Z"
+            ),
+            headers.clone(),
+            &router,
+        ),
+        status::Ok,
+        None,
+    );
+
+    // Python3: datetime.datetime.now(datetime.timezone.utc).isoformat()
+    /* TODO: this doesn't work currently :(
+    helpers::check_http_response(
+        request::get(
+            &format!("http://localhost:9411/v0/editgroup/reviewable?since=2019-01-17T23:30:45.799289+00:00"),
+            headers.clone(),
+            &router,
+        ),
+        status::Ok,
+        None,
+    );
+    */
+
+    helpers::check_http_response(
+        request::post(
+            "http://localhost:9411/v0/container/batch?autoaccept=asdf",
+            headers.clone(),
+            r#"[{"name": "test journal"}, {"name": "another test journal"}]"#,
+            &router,
+        ),
+        status::BadRequest,
+        Some("boolean"),
+    );
+
+    helpers::check_http_response(
+        request::post(
+            "http://localhost:9411/v0/container/batch?autoaccept=True",
+            headers.clone(),
+            r#"[{"name": "test journal"}, {"name": "another test journal"}]"#,
+            &router,
+        ),
+        status::Created,
+        None,
+    );
+}
diff --git a/rust/tests/test_refs.rs b/rust/tests/test_refs.rs
new file mode 100644
index 00000000..ae4be4b5
--- /dev/null
+++ b/rust/tests/test_refs.rs
@@ -0,0 +1,169 @@
+use diesel::prelude::*;
+use fatcat::database_models::*;
+use fatcat::database_schema::*;
+use fatcat::editing::{accept_editgroup, make_edit_context};
+use fatcat::entity_crud::{EntityCrud, HideFlags};
+use fatcat::identifiers::FatcatId;
+use fatcat::server;
+use fatcat_api_spec::models::*;
+use std::str::FromStr;
+use uuid::Uuid;
+
+mod helpers;
+
+#[test]
+fn test_refs_blob() {
+    let server = server::create_test_server().unwrap();
+    let conn = server.db_pool.get().expect("db_pool error");
+    let editor_id = FatcatId::from_str(helpers::TEST_ADMIN_EDITOR_ID).unwrap();
+    let editgroup_id = helpers::quick_editgroup(&conn);
+    let edit_context =
+        make_edit_context(&conn, editor_id, Some(editgroup_id), false, None, None).unwrap();
+
+    // this release entity should be unchanged after being inserted/fetched
+    let mut r1 = ReleaseEntity::new();
+    r1.title = Some("release-test hashes".to_string());
+    r1.refs = Some(vec![
+        ReleaseRef {
+            index: Some(0),
+            target_release_id: None,
+            extra: None,
+            key: Some("one".to_string()),
+            year: Some(1932),
+            container_name: Some("bogus container".to_string()),
+            title: Some("first bogus paper".to_string()),
+            locator: Some("p100".to_string()),
+        },
+        ReleaseRef {
+            index: Some(1),
+            target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaai".to_string()),
+            extra: None,
+            key: Some("one".to_string()),
+            year: Some(2032),
+            container_name: Some("bogus other container".to_string()),
+            title: Some("second bogus paper".to_string()),
+            locator: Some("p200".to_string()),
+        },
+    ]);
+
+    // this release entity should have the same hash as r1. the indexes will change after fetching,
+    // but otherwise the fetched refs should be the same as the r1 fetched results.
+    let mut r2 = r1.clone();
+    r2.refs = Some(vec![
+        ReleaseRef {
+            index: None,
+            target_release_id: None,
+            extra: None,
+            key: Some("one".to_string()),
+            year: Some(1932),
+            container_name: Some("bogus container".to_string()),
+            title: Some("first bogus paper".to_string()),
+            locator: Some("p100".to_string()),
+        },
+        ReleaseRef {
+            index: Some(99),
+            target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaai".to_string()),
+            extra: None,
+            key: Some("one".to_string()),
+            year: Some(2032),
+            container_name: Some("bogus other container".to_string()),
+            title: Some("second bogus paper".to_string()),
+            locator: Some("p200".to_string()),
+        },
+    ]);
+
+    // this release entity has different ref *targets* and indexes, but should still have the same
+    // refs_blob hashes as r1/r2.
+    let mut r3 = r1.clone();
+    r3.refs = Some(vec![
+        ReleaseRef {
+            index: Some(1),
+            target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaae".to_string()),
+            extra: None,
+            key: Some("one".to_string()),
+            year: Some(1932),
+            container_name: Some("bogus container".to_string()),
+            title: Some("first bogus paper".to_string()),
+            locator: Some("p100".to_string()),
+        },
+        ReleaseRef {
+            index: Some(1),
+            target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaam".to_string()),
+            extra: None,
+            key: Some("one".to_string()),
+            year: Some(2032),
+            container_name: Some("bogus other container".to_string()),
+            title: Some("second bogus paper".to_string()),
+            locator: Some("p200".to_string()),
+        },
+    ]);
+
+    // this one is obviously just plain different (hashes shouldn't match)
+    let mut r4 = r1.clone();
+    r4.refs = Some(vec![ReleaseRef {
+        index: Some(1),
+        target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaae".to_string()),
+        extra: None,
+        key: Some("one".to_string()),
+        year: Some(1932),
+        container_name: Some("bogus container".to_string()),
+        title: Some("first bogus paper".to_string()),
+        locator: Some("p100".to_string()),
+    }]);
+
+    let edit1 = r1.db_create(&conn, &edit_context).unwrap();
+    let edit2 = r2.db_create(&conn, &edit_context).unwrap();
+    let edit3 = r3.db_create(&conn, &edit_context).unwrap();
+    let edit4 = r4.db_create(&conn, &edit_context).unwrap();
+
+    let r1b = ReleaseEntity::db_get(&conn, edit1.ident_id.into(), HideFlags::none()).unwrap();
+    let r2b = ReleaseEntity::db_get(&conn, edit2.ident_id.into(), HideFlags::none()).unwrap();
+    let r3b = ReleaseEntity::db_get(&conn, edit3.ident_id.into(), HideFlags::none()).unwrap();
+    let r4b = ReleaseEntity::db_get(&conn, edit4.ident_id.into(), HideFlags::none()).unwrap();
+    assert_eq!(r1b.refs, r1.refs);
+    assert_eq!(r1b.refs, r2b.refs);
+    assert_ne!(r1b.refs, r3b.refs);
+    assert_ne!(r1b.refs, r4b.refs);
+
+    let r1_row: ReleaseRevRow = release_rev::table
+        .find(Uuid::from_str(&r1b.revision.clone().unwrap()).unwrap())
+        .get_result(&conn)
+        .unwrap();
+    let r2_row: ReleaseRevRow = release_rev::table
+        .find(Uuid::from_str(&r2b.revision.unwrap()).unwrap())
+        .get_result(&conn)
+        .unwrap();
+    let r3_row: ReleaseRevRow = release_rev::table
+        .find(Uuid::from_str(&r3b.revision.clone().unwrap()).unwrap())
+        .get_result(&conn)
+        .unwrap();
+    let r4_row: ReleaseRevRow = release_rev::table
+        .find(Uuid::from_str(&r4b.revision.unwrap()).unwrap())
+        .get_result(&conn)
+        .unwrap();
+    assert_eq!(r1_row.refs_blob_sha1, r2_row.refs_blob_sha1);
+    assert_eq!(r1_row.refs_blob_sha1, r3_row.refs_blob_sha1);
+    assert_ne!(r1_row.refs_blob_sha1, r4_row.refs_blob_sha1);
+
+    // ensure that SHA1 hashing is stable over time (as much as possible!)
+    assert_eq!(
+        r1_row.refs_blob_sha1,
+        Some("4e38812fbf99e00e0cb648896e9f7a9d58c5ab23".to_string())
+    );
+
+    // update r1 with new target_idents (r3); refs_blob SHA1 still shouldn't change
+    accept_editgroup(&conn, editgroup_id).unwrap();
+    let editgroup_id = helpers::quick_editgroup(&conn);
+    let edit_context =
+        make_edit_context(&conn, editor_id, Some(editgroup_id), false, None, None).unwrap();
+
+    let _edit4 = r3b
+        .db_update(&conn, &edit_context, edit1.ident_id.into())
+        .unwrap();
+    let r1c = ReleaseEntity::db_get(&conn, edit1.ident_id.into(), HideFlags::none()).unwrap();
+    let r1c_row: ReleaseRevRow = release_rev::table
+        .find(Uuid::from_str(&r1c.revision.unwrap()).unwrap())
+        .get_result(&conn)
+        .unwrap();
+    assert_eq!(r1_row.refs_blob_sha1, r1c_row.refs_blob_sha1);
+}
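
The expanded lookup match in endpoint_handlers.rs above accepts exactly one of eight external identifiers (DOI, Wikidata QID, ISBN-13, PMID, PMCID, CORE, arXiv, JSTOR); any other combination falls through to MissingOrMultipleExternalId. A hedged, standalone sketch of that "exactly one" rule, separate from the real handler code:

```rust
// Illustrative helper only (not part of the patch): the real handler encodes
// this rule as a tuple match over eight Option values.
fn exactly_one_lookup_id(ids: &[Option<&str>]) -> bool {
    ids.iter().filter(|id| id.is_some()).count() == 1
}

fn main() {
    let doi = Some("10.123/abc456");
    let arxiv_id: Option<&str> = None;
    let jstor_id: Option<&str> = None;
    // only the DOI is set, so a lookup would route to the DOI match arm
    assert!(exactly_one_lookup_id(&[doi, arxiv_id, jstor_id]));
    // two identifiers set would be rejected as MissingOrMultipleExternalId
    assert!(!exactly_one_lookup_id(&[doi, Some("1905.03769"), jstor_id]));
}
```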
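The batch-create wrapper in endpoints.rs now takes `autoaccept`, `description`, and `extra` as query parameters: `autoaccept` defaults to false (and, when true, requires the Admin role rather than Editor), while `extra` arrives as a plain string and is parsed into JSON server-side. A simplified sketch of just that parameter normalization; `normalize_batch_params` is an illustrative name, not one of the generated API types:

```rust
use serde_json::Value;

// Hypothetical free function mirroring the parameter handling inside the
// wrap_entity_handlers! batch arm shown in the diff above.
fn normalize_batch_params(
    autoaccept: Option<bool>,
    extra: Option<String>,
) -> Result<(bool, Option<Value>), serde_json::Error> {
    let autoaccept = autoaccept.unwrap_or(false);
    let extra: Option<Value> = match extra {
        Some(raw) => Some(serde_json::from_str(&raw)?),
        None => None,
    };
    Ok((autoaccept, extra))
}

fn main() {
    let (autoaccept, extra) =
        normalize_batch_params(Some(true), Some(r#"{"source": "bulk-import"}"#.to_string()))
            .unwrap();
    assert!(autoaccept); // would require FatcatRole::Admin in the real endpoint
    assert_eq!(extra.unwrap()["source"], "bulk-import");
}
```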
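On the refs_blob change in entity_crud.rs: each reference list is serialized to JSON with the per-revision `index` and `target_release_id` fields stripped, hashed with SHA-1, and upserted into `refs_blob` keyed by that hash, which is why r1/r2/r3 in test_refs.rs share a `refs_blob_sha1` while r4 does not. A minimal standalone sketch of the idea, assuming the same `sha1`, `serde`, and `serde_json` crates the patch uses; `Ref` here is a simplified stand-in for the real `RefsBlobJson` type:

```rust
use serde::Serialize; // assumes serde with the "derive" feature
use sha1::Sha1;

// Simplified stand-in for RefsBlobJson; the field set is illustrative only.
#[derive(Serialize, Clone)]
struct Ref {
    index: Option<i64>,
    target_release_id: Option<String>,
    key: Option<String>,
    year: Option<i64>,
    container_name: Option<String>,
    title: Option<String>,
    locator: Option<String>,
}

// Hash a normalized copy of the reference list: index and target are cleared
// first, so re-pointing a target or re-indexing doesn't change the blob key.
fn refs_blob_sha1(refs: &[Ref]) -> String {
    let normalized: Vec<Ref> = refs
        .iter()
        .map(|r| {
            let mut r = r.clone();
            r.index = None;
            r.target_release_id = None;
            r
        })
        .collect();
    // Like the patch itself (see its `canonical_json` TODO), this relies on
    // serde_json emitting fields in a stable order for a fixed struct.
    let refs_json = serde_json::to_value(normalized).expect("refs are serializable");
    Sha1::from(refs_json.to_string()).hexdigest()
}

fn main() {
    let a = Ref {
        index: Some(0),
        target_release_id: None,
        key: Some("one".to_string()),
        year: Some(1932),
        container_name: Some("bogus container".to_string()),
        title: Some("first bogus paper".to_string()),
        locator: Some("p100".to_string()),
    };
    let mut b = a.clone();
    b.index = Some(99);
    b.target_release_id = Some("aaaaaaaaaaaaarceaaaaaaaaai".to_string());
    // differing index/target does not change the hash; a different title would
    assert_eq!(refs_blob_sha1(&[a.clone()]), refs_blob_sha1(&[b]));
    let mut c = a.clone();
    c.title = Some("a different paper".to_string());
    assert_ne!(refs_blob_sha1(&[a]), refs_blob_sha1(&[c]));
}
```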
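Finally, test_query_params pins down which `since` formats the reviewable-editgroups endpoint accepts: RFC 3339 with a trailing "Z" parses (with or without fractional seconds), while the "+00:00" offset form is still commented out as a TODO. A small client-side sketch, assuming the `chrono` crate, of producing an accepted value:

```rust
use chrono::{SecondsFormat, Utc};

fn main() {
    // `true` requests the "Z" suffix rather than a "+00:00" offset, matching
    // the format the tests show parsing successfully.
    let since = Utc::now().to_rfc3339_opts(SecondsFormat::Micros, true);
    // e.g. "2019-01-17T23:32:03.269010Z"
    println!("GET /v0/editgroup/reviewable?since={}", since);
}
```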