From faaefd2a2a998551b50b7de5c8e231d53b55882a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 23 Dec 2019 19:11:36 -0800 Subject: update chocula input data files Including updating fetch script, README links, and chocula.py path references. --- README.md | 14 +++++++++----- chocula.py | 20 ++++++++++---------- data/fetch.sh | 39 ++++++++++++++++----------------------- 3 files changed, 35 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 8cb0169..110a43c 100644 --- a/README.md +++ b/README.md @@ -67,18 +67,22 @@ In order of precedence (first higher than later): - Norwegian Registry - Original: - Snapshot: -- Wikidata (TODO: Journal-level not title-level) - - Original: - - Snapshot: +- Wikidata via SPARQL Query + - SPARQL: + - Snapshot: - KBART reports: LOCKSS, CLOCKSS, Portico - Original: (multiple, see README in IA item) - - Snapshot: + - Snapshot: - JSTOR - Original: - - Snapshot: - Crossref title list (not DOIs) - Original: - Snapshot: +- OpenAPC Dataset + - Original: + - Snapshot: +- EZB Metadata + - Snapshot: - IA SIM Microfilm catalog - Original: - IA homepage crawl attempts diff --git a/chocula.py b/chocula.py index aab4fc3..c8173a3 100755 --- a/chocula.py +++ b/chocula.py @@ -58,31 +58,31 @@ import stdnum.issn ################### File Config -ISSNL_FILE = 'data/20190730.ISSN-to-ISSN-L.txt' +ISSNL_FILE = 'data/20191220.ISSN-to-ISSN-L.txt' ENTREZ_FILE = 'data/entrez-journals.csv' ROAD_FILE = 'data/road-2018-01-24.tsv' ROAD_DATE = '2018-01-24' -DOAJ_FILE = 'data/journalcsv__doaj_20190731_0130_utf8.csv' -DOAJ_DATE = '2019-07-31' -CROSSREF_FILE = 'data/doi_titles_file_2019-08-17.csv' +DOAJ_FILE = 'data/journalcsv__doaj_20191221_0135_utf8.csv' +DOAJ_DATE = '2019-12-21' +CROSSREF_FILE = 'data/doi_titles_file_2019-12-20.csv' SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv' SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv' -NORWEGIAN_FILE = 'data/2018-03-02 Norwegian Register for Scientific Journals and Series.csv' -NORWEGIAN_DATE = '2018-03-02' +NORWEGIAN_FILE = 'data/2019-12-21 Norwegian Register for Scientific Journals and Series.csv' +NORWEGIAN_DATE = '2019-12-21' LOCKSS_FILE = 'data/kbart_LOCKSS.txt' CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt' PORTICO_FILE = 'data/Portico_Holding_KBart.txt' -JSTOR_FILE = 'data/jstor_all-archive-titles.txt' +JSTOR_FILE = 'data/JSTOR_Global_AllArchiveTitles_2019-12-21.txt' SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv' IA_CRAWL_FILE = 'data/url_status.2019-07-31.partial-ia.json' SZCZEPANSKI_DATE = '2018' SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json' EZB_FILE = 'data/ezb_metadata.json' GOLD_OA_FILE = 'data/ISSN_Gold-OA_3.0.csv' -WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-07-30.tsv' -OPENAPC_FILE = 'data/apc_de.2019-07-30.csv' -FATCAT_CONTAINER_FILE = 'data/container_export.2019-09-03.json' +WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-12-20.tsv' +OPENAPC_FILE = 'data/apc_de.2019-12-20.csv' +FATCAT_CONTAINER_FILE = 'data/container_export.2019-12-13.json' FATCAT_STATS_FILE = 'data/container_stats.json' diff --git a/data/fetch.sh b/data/fetch.sh index 182953d..d6b3bab 100755 --- a/data/fetch.sh +++ b/data/fetch.sh @@ -6,13 +6,11 @@ set -eu #unzip -n road-2018-01-24-export-issn.zip wget -c https://archive.org/download/road-issn-2018/road-2018-01-24.tsv -#wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv -wget -c https://archive.org/download/doaj_bulk_metadata_2019/journalcsv__doaj_20190731_0130_utf8.csv +wget -c https://archive.org/download/doaj_bulk_metadata_2019/journalcsv__doaj_20191221_0135_utf8.csv -#wget -c https://archive.org/download/issn_issnl_mappings/20190129.ISSN-to-ISSN-L.txt -wget -c https://archive.org/download/issn_issnl_mappings/20190730.ISSN-to-ISSN-L.txt +wget -c https://archive.org/download/issn_issnl_mappings/20191220.ISSN-to-ISSN-L.txt -wget -c https://archive.org/download/crossref_doi_titles/doi_titles_file_2019-08-17.csv +wget -c https://archive.org/download/crossref_doi_titles/doi_titles_file_2019-12-20.csv #wget -c https://archive.org/download/ncbi-entrez-2019/J_Entrez.txt -O ncbi-entrez-2019.txt @@ -20,21 +18,16 @@ wget -c https://archive.org/download/moreo.info-2018-12-20/romeo-journals.csv wget -c https://archive.org/download/moreo.info-2018-12-20/romeo-policies.csv wget -c https://archive.org/download/moreo.info-2018-12-20/entrez-journals.csv -wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv +wget -c https://archive.org/download/keepers_reports_201912/JSTOR_Global_AllArchiveTitles_2019-12-21.txt +#wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_AllCurrentJournalTitles_2019-01-07.txt +#wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_EarlyJournalContent_2017-06-08.txt +wget -c https://archive.org/download/keepers_reports_201912/kbart_CLOCKSS.txt +wget -c https://archive.org/download/keepers_reports_201912/kbart_LOCKSS.txt +wget -c https://archive.org/download/keepers_reports_201912/Portico_Holding_KBart.txt -wget -c https://archive.org/download/keepers_reports_201901/jstor_all-archive-titles.txt -wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_AllCurrentJournalTitles_2019-01-07.txt -wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_EarlyJournalContent_2017-06-08.txt -wget -c https://archive.org/download/keepers_reports_201901/kbart_CLOCKSS.txt -wget -c https://archive.org/download/keepers_reports_201901/kbart_LOCKSS.txt -wget -c https://archive.org/download/keepers_reports_201901/Portico_Holding_KBart.txt - -wget -c https://archive.org/download/ia_journal_metadata_explore_2018-04-05/journal_homepage_results.partial.tsv - -#wget -c https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.xlsx wget -c https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.converted.csv -wget -c https://archive.org/download/norwegian_register_journals/2018-03-02%20Norwegian%20Register%20for%20Scientific%20Journals%20and%20Series.csv +wget -c https://archive.org/download/norwegian_register_journals/2019-12-21%20Norwegian%20Register%20for%20Scientific%20Journals%20and%20Series.csv #wget -c https://archive.org/download/open_academic_graph_2019/mag_venues.zip #unzip mag_venues.zip @@ -43,11 +36,11 @@ wget -c https://archive.org/download/szczepanski-oa-journal-list-2018/Jan-Szczep wget -c https://archive.org/download/ezb_snapshot_2019-07-11/ezb_metadata.json wget -c https://archive.org/download/ISSN-GOLD-OA-3/ISSN_Gold-OA_3.0.csv -wget -c https://archive.org/download/openapc-dataset/apc_de.2019-07-30.csv -wget -c https://archive.org/download/wikidata-journal-metadata/wikidata_journals_sparql.2019-07-30.tsv +wget -c https://archive.org/download/openapc-dataset/apc_de.2019-12-20.csv +wget -c https://archive.org/download/wikidata-journal-metadata/wikidata_journals_sparql.2019-12-20.tsv -wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-ia.json -wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-robocracy.json +#wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-ia.json +#wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-robocracy.json -wget -c https://archive.org/download/fatcat_bulk_exports_2019-07-07/container_export.2019-09-03.json.gz -zcat container_export.2019-09-03.json.gz > container_export.2019-09-03.json +wget -c https://archive.org/download/fatcat_bulk_exports_2019-12-13/container_export.json.gz +zcat container_export.json.gz > container_export.2019-12-13.json -- cgit v1.2.3