about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-12-23 19:11:36 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2019-12-23 19:11:38 -0800
commit: faaefd2a2a998551b50b7de5c8e231d53b55882a (patch)
tree: 29831a6c152733b56e8c4f21f984e096b192b658
parent: 55a00912586a2aea705687472578dc9e8486be5e (diff)
download: chocula-faaefd2a2a998551b50b7de5c8e231d53b55882a.tar.gz
chocula-faaefd2a2a998551b50b7de5c8e231d53b55882a.zip
update chocula input data files
Including updating fetch script, README links, and chocula.py path references.
-rw-r--r-- README.md 14
-rwxr-xr-x chocula.py 20
-rwxr-xr-x data/fetch.sh 39
3 files changed, 35 insertions, 38 deletions
diff --git a/README.md b/README.md
index 8cb0169..110a43c 100644
--- a/README.md
+++ b/README.md
@@ -67,18 +67,22 @@ In order of precedence (first higher than later):
- Norwegian Registry
- Original: <https://dbh.nsd.uib.no/publiseringskanaler/AlltidFerskListe>
- Snapshot: <https://archive.org/download/norwegian_register_journals>
-- Wikidata (TODO: Journal-level not title-level)
- - Original: <http://uri.gbv.de/wikicite/20180903/>
- - Snapshot: <https://archive.org/download/wikicite-biblio-data-20180903>
+- Wikidata via SPARQL Query
+ - SPARQL: <https://archive.org/download/wikidata-journal-metadata/wikidata.sparql>
+ - Snapshot: <https://archive.org/download/wikidata-journal-metadata>
- KBART reports: LOCKSS, CLOCKSS, Portico
- Original: (multiple, see README in IA item)
- - Snapshot: <https://archive.org/download/keepers_reports_201901>
+ - Snapshot: <https://archive.org/download/keepers_reports_201912>
- JSTOR
- Original: <https://support.jstor.org/hc/en-us/articles/115007466248-JSTOR-title-lists>
- - Snapshot: <KBART jstor_all-archive-titles.txt>
- Crossref title list (not DOIs)
- Original: <https://wwwold.crossref.org/titlelist/titleFile.csv>
- Snapshot: <https://archive.org/download/crossref_doi_titles>
+- OpenAPC Dataset
+ - Original: <https://github.com/OpenAPC/openapc-de/blob/master/data/apc_de.csv>
+ - Snapshot: <https://archive.org/download/openapc-dataset>
+- EZB Metadata
+ - Snapshot: <https://archive.org/download/ezb_snapshot_2019-07-11>
- IA SIM Microfilm catalog
- Original: <https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.xlsx>
- IA homepage crawl attempts
diff --git a/chocula.py b/chocula.py
index aab4fc3..c8173a3 100755
--- a/chocula.py
+++ b/chocula.py
@@ -58,31 +58,31 @@ import stdnum.issn
################### File Config
-ISSNL_FILE = 'data/20190730.ISSN-to-ISSN-L.txt'
+ISSNL_FILE = 'data/20191220.ISSN-to-ISSN-L.txt'
ENTREZ_FILE = 'data/entrez-journals.csv'
ROAD_FILE = 'data/road-2018-01-24.tsv'
ROAD_DATE = '2018-01-24'
-DOAJ_FILE = 'data/journalcsv__doaj_20190731_0130_utf8.csv'
-DOAJ_DATE = '2019-07-31'
-CROSSREF_FILE = 'data/doi_titles_file_2019-08-17.csv'
+DOAJ_FILE = 'data/journalcsv__doaj_20191221_0135_utf8.csv'
+DOAJ_DATE = '2019-12-21'
+CROSSREF_FILE = 'data/doi_titles_file_2019-12-20.csv'
SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv'
SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv'
-NORWEGIAN_FILE = 'data/2018-03-02 Norwegian Register for Scientific Journals and Series.csv'
-NORWEGIAN_DATE = '2018-03-02'
+NORWEGIAN_FILE = 'data/2019-12-21 Norwegian Register for Scientific Journals and Series.csv'
+NORWEGIAN_DATE = '2019-12-21'
LOCKSS_FILE = 'data/kbart_LOCKSS.txt'
CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt'
PORTICO_FILE = 'data/Portico_Holding_KBart.txt'
-JSTOR_FILE = 'data/jstor_all-archive-titles.txt'
+JSTOR_FILE = 'data/JSTOR_Global_AllArchiveTitles_2019-12-21.txt'
SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv'
IA_CRAWL_FILE = 'data/url_status.2019-07-31.partial-ia.json'
SZCZEPANSKI_DATE = '2018'
SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json'
EZB_FILE = 'data/ezb_metadata.json'
GOLD_OA_FILE = 'data/ISSN_Gold-OA_3.0.csv'
-WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-07-30.tsv'
-OPENAPC_FILE = 'data/apc_de.2019-07-30.csv'
-FATCAT_CONTAINER_FILE = 'data/container_export.2019-09-03.json'
+WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-12-20.tsv'
+OPENAPC_FILE = 'data/apc_de.2019-12-20.csv'
+FATCAT_CONTAINER_FILE = 'data/container_export.2019-12-13.json'
FATCAT_STATS_FILE = 'data/container_stats.json'
diff --git a/data/fetch.sh b/data/fetch.sh
index 182953d..d6b3bab 100755
--- a/data/fetch.sh
+++ b/data/fetch.sh
@@ -6,13 +6,11 @@ set -eu
#unzip -n road-2018-01-24-export-issn.zip
wget -c https://archive.org/download/road-issn-2018/road-2018-01-24.tsv
-#wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv
-wget -c https://archive.org/download/doaj_bulk_metadata_2019/journalcsv__doaj_20190731_0130_utf8.csv
+wget -c https://archive.org/download/doaj_bulk_metadata_2019/journalcsv__doaj_20191221_0135_utf8.csv
-#wget -c https://archive.org/download/issn_issnl_mappings/20190129.ISSN-to-ISSN-L.txt
-wget -c https://archive.org/download/issn_issnl_mappings/20190730.ISSN-to-ISSN-L.txt
+wget -c https://archive.org/download/issn_issnl_mappings/20191220.ISSN-to-ISSN-L.txt
-wget -c https://archive.org/download/crossref_doi_titles/doi_titles_file_2019-08-17.csv
+wget -c https://archive.org/download/crossref_doi_titles/doi_titles_file_2019-12-20.csv
#wget -c https://archive.org/download/ncbi-entrez-2019/J_Entrez.txt -O ncbi-entrez-2019.txt
@@ -20,21 +18,16 @@ wget -c https://archive.org/download/moreo.info-2018-12-20/romeo-journals.csv
wget -c https://archive.org/download/moreo.info-2018-12-20/romeo-policies.csv
wget -c https://archive.org/download/moreo.info-2018-12-20/entrez-journals.csv
-wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv
+wget -c https://archive.org/download/keepers_reports_201912/JSTOR_Global_AllArchiveTitles_2019-12-21.txt
+#wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_AllCurrentJournalTitles_2019-01-07.txt
+#wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_EarlyJournalContent_2017-06-08.txt
+wget -c https://archive.org/download/keepers_reports_201912/kbart_CLOCKSS.txt
+wget -c https://archive.org/download/keepers_reports_201912/kbart_LOCKSS.txt
+wget -c https://archive.org/download/keepers_reports_201912/Portico_Holding_KBart.txt
-wget -c https://archive.org/download/keepers_reports_201901/jstor_all-archive-titles.txt
-wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_AllCurrentJournalTitles_2019-01-07.txt
-wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_EarlyJournalContent_2017-06-08.txt
-wget -c https://archive.org/download/keepers_reports_201901/kbart_CLOCKSS.txt
-wget -c https://archive.org/download/keepers_reports_201901/kbart_LOCKSS.txt
-wget -c https://archive.org/download/keepers_reports_201901/Portico_Holding_KBart.txt
-
-wget -c https://archive.org/download/ia_journal_metadata_explore_2018-04-05/journal_homepage_results.partial.tsv
-
-#wget -c https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.xlsx
wget -c https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.converted.csv
-wget -c https://archive.org/download/norwegian_register_journals/2018-03-02%20Norwegian%20Register%20for%20Scientific%20Journals%20and%20Series.csv
+wget -c https://archive.org/download/norwegian_register_journals/2019-12-21%20Norwegian%20Register%20for%20Scientific%20Journals%20and%20Series.csv
#wget -c https://archive.org/download/open_academic_graph_2019/mag_venues.zip
#unzip mag_venues.zip
@@ -43,11 +36,11 @@ wget -c https://archive.org/download/szczepanski-oa-journal-list-2018/Jan-Szczep
wget -c https://archive.org/download/ezb_snapshot_2019-07-11/ezb_metadata.json
wget -c https://archive.org/download/ISSN-GOLD-OA-3/ISSN_Gold-OA_3.0.csv
-wget -c https://archive.org/download/openapc-dataset/apc_de.2019-07-30.csv
-wget -c https://archive.org/download/wikidata-journal-metadata/wikidata_journals_sparql.2019-07-30.tsv
+wget -c https://archive.org/download/openapc-dataset/apc_de.2019-12-20.csv
+wget -c https://archive.org/download/wikidata-journal-metadata/wikidata_journals_sparql.2019-12-20.tsv
-wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-ia.json
-wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-robocracy.json
+#wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-ia.json
+#wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-robocracy.json
-wget -c https://archive.org/download/fatcat_bulk_exports_2019-07-07/container_export.2019-09-03.json.gz
-zcat container_export.2019-09-03.json.gz > container_export.2019-09-03.json
+wget -c https://archive.org/download/fatcat_bulk_exports_2019-12-13/container_export.json.gz
+zcat container_export.json.gz > container_export.2019-12-13.json