diff options
-rw-r--r-- | README.md | 14 | ||||
-rwxr-xr-x | chocula.py | 20 | ||||
-rwxr-xr-x | data/fetch.sh | 39 |
3 files changed, 35 insertions, 38 deletions
@@ -67,18 +67,22 @@ In order of precedence (first higher than later): - Norwegian Registry - Original: <https://dbh.nsd.uib.no/publiseringskanaler/AlltidFerskListe> - Snapshot: <https://archive.org/download/norwegian_register_journals> -- Wikidata (TODO: Journal-level not title-level) - - Original: <http://uri.gbv.de/wikicite/20180903/> - - Snapshot: <https://archive.org/download/wikicite-biblio-data-20180903> +- Wikidata via SPARQL Query + - SPARQL: <https://archive.org/download/wikidata-journal-metadata/wikidata.sparql> + - Snapshot: <https://archive.org/download/wikidata-journal-metadata> - KBART reports: LOCKSS, CLOCKSS, Portico - Original: (multiple, see README in IA item) - - Snapshot: <https://archive.org/download/keepers_reports_201901> + - Snapshot: <https://archive.org/download/keepers_reports_201912> - JSTOR - Original: <https://support.jstor.org/hc/en-us/articles/115007466248-JSTOR-title-lists> - - Snapshot: <KBART jstor_all-archive-titles.txt> - Crossref title list (not DOIs) - Original: <https://wwwold.crossref.org/titlelist/titleFile.csv> - Snapshot: <https://archive.org/download/crossref_doi_titles> +- OpenAPC Dataset + - Original: <https://github.com/OpenAPC/openapc-de/blob/master/data/apc_de.csv> + - Snapshot: <https://archive.org/download/openapc-dataset> +- EZB Metadata + - Snapshot: <https://archive.org/download/ezb_snapshot_2019-07-11> - IA SIM Microfilm catalog - Original: <https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.xlsx> - IA homepage crawl attempts @@ -58,31 +58,31 @@ import stdnum.issn ################### File Config -ISSNL_FILE = 'data/20190730.ISSN-to-ISSN-L.txt' +ISSNL_FILE = 'data/20191220.ISSN-to-ISSN-L.txt' ENTREZ_FILE = 'data/entrez-journals.csv' ROAD_FILE = 'data/road-2018-01-24.tsv' ROAD_DATE = '2018-01-24' -DOAJ_FILE = 'data/journalcsv__doaj_20190731_0130_utf8.csv' -DOAJ_DATE = '2019-07-31' -CROSSREF_FILE = 'data/doi_titles_file_2019-08-17.csv' +DOAJ_FILE = 'data/journalcsv__doaj_20191221_0135_utf8.csv' +DOAJ_DATE = '2019-12-21' +CROSSREF_FILE = 'data/doi_titles_file_2019-12-20.csv' SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv' SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv' -NORWEGIAN_FILE = 'data/2018-03-02 Norwegian Register for Scientific Journals and Series.csv' -NORWEGIAN_DATE = '2018-03-02' +NORWEGIAN_FILE = 'data/2019-12-21 Norwegian Register for Scientific Journals and Series.csv' +NORWEGIAN_DATE = '2019-12-21' LOCKSS_FILE = 'data/kbart_LOCKSS.txt' CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt' PORTICO_FILE = 'data/Portico_Holding_KBart.txt' -JSTOR_FILE = 'data/jstor_all-archive-titles.txt' +JSTOR_FILE = 'data/JSTOR_Global_AllArchiveTitles_2019-12-21.txt' SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv' IA_CRAWL_FILE = 'data/url_status.2019-07-31.partial-ia.json' SZCZEPANSKI_DATE = '2018' SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json' EZB_FILE = 'data/ezb_metadata.json' GOLD_OA_FILE = 'data/ISSN_Gold-OA_3.0.csv' -WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-07-30.tsv' -OPENAPC_FILE = 'data/apc_de.2019-07-30.csv' -FATCAT_CONTAINER_FILE = 'data/container_export.2019-09-03.json' +WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-12-20.tsv' +OPENAPC_FILE = 'data/apc_de.2019-12-20.csv' +FATCAT_CONTAINER_FILE = 'data/container_export.2019-12-13.json' FATCAT_STATS_FILE = 'data/container_stats.json' diff --git a/data/fetch.sh b/data/fetch.sh index 182953d..d6b3bab 100755 --- a/data/fetch.sh +++ b/data/fetch.sh @@ -6,13 +6,11 @@ set -eu #unzip -n road-2018-01-24-export-issn.zip wget -c https://archive.org/download/road-issn-2018/road-2018-01-24.tsv -#wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv -wget -c https://archive.org/download/doaj_bulk_metadata_2019/journalcsv__doaj_20190731_0130_utf8.csv +wget -c https://archive.org/download/doaj_bulk_metadata_2019/journalcsv__doaj_20191221_0135_utf8.csv -#wget -c https://archive.org/download/issn_issnl_mappings/20190129.ISSN-to-ISSN-L.txt -wget -c https://archive.org/download/issn_issnl_mappings/20190730.ISSN-to-ISSN-L.txt +wget -c https://archive.org/download/issn_issnl_mappings/20191220.ISSN-to-ISSN-L.txt -wget -c https://archive.org/download/crossref_doi_titles/doi_titles_file_2019-08-17.csv +wget -c https://archive.org/download/crossref_doi_titles/doi_titles_file_2019-12-20.csv #wget -c https://archive.org/download/ncbi-entrez-2019/J_Entrez.txt -O ncbi-entrez-2019.txt @@ -20,21 +18,16 @@ wget -c https://archive.org/download/moreo.info-2018-12-20/romeo-journals.csv wget -c https://archive.org/download/moreo.info-2018-12-20/romeo-policies.csv wget -c https://archive.org/download/moreo.info-2018-12-20/entrez-journals.csv -wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv +wget -c https://archive.org/download/keepers_reports_201912/JSTOR_Global_AllArchiveTitles_2019-12-21.txt +#wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_AllCurrentJournalTitles_2019-01-07.txt +#wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_EarlyJournalContent_2017-06-08.txt +wget -c https://archive.org/download/keepers_reports_201912/kbart_CLOCKSS.txt +wget -c https://archive.org/download/keepers_reports_201912/kbart_LOCKSS.txt +wget -c https://archive.org/download/keepers_reports_201912/Portico_Holding_KBart.txt -wget -c https://archive.org/download/keepers_reports_201901/jstor_all-archive-titles.txt -wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_AllCurrentJournalTitles_2019-01-07.txt -wget -c https://archive.org/download/keepers_reports_201901/JSTOR_Global_EarlyJournalContent_2017-06-08.txt -wget -c https://archive.org/download/keepers_reports_201901/kbart_CLOCKSS.txt -wget -c https://archive.org/download/keepers_reports_201901/kbart_LOCKSS.txt -wget -c https://archive.org/download/keepers_reports_201901/Portico_Holding_KBart.txt - -wget -c https://archive.org/download/ia_journal_metadata_explore_2018-04-05/journal_homepage_results.partial.tsv - -#wget -c https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.xlsx wget -c https://archive.org/download/SerialsOnMicrofilmCollection/MASTER%20TITLE_METADATA_LIST_20171019.converted.csv -wget -c https://archive.org/download/norwegian_register_journals/2018-03-02%20Norwegian%20Register%20for%20Scientific%20Journals%20and%20Series.csv +wget -c https://archive.org/download/norwegian_register_journals/2019-12-21%20Norwegian%20Register%20for%20Scientific%20Journals%20and%20Series.csv #wget -c https://archive.org/download/open_academic_graph_2019/mag_venues.zip #unzip mag_venues.zip @@ -43,11 +36,11 @@ wget -c https://archive.org/download/szczepanski-oa-journal-list-2018/Jan-Szczep wget -c https://archive.org/download/ezb_snapshot_2019-07-11/ezb_metadata.json wget -c https://archive.org/download/ISSN-GOLD-OA-3/ISSN_Gold-OA_3.0.csv -wget -c https://archive.org/download/openapc-dataset/apc_de.2019-07-30.csv -wget -c https://archive.org/download/wikidata-journal-metadata/wikidata_journals_sparql.2019-07-30.tsv +wget -c https://archive.org/download/openapc-dataset/apc_de.2019-12-20.csv +wget -c https://archive.org/download/wikidata-journal-metadata/wikidata_journals_sparql.2019-12-20.tsv -wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-ia.json -wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-robocracy.json +#wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-ia.json +#wget -c https://archive.org/download/chocula-journal-counts/url_status.2019-07-31.partial-robocracy.json -wget -c https://archive.org/download/fatcat_bulk_exports_2019-07-07/container_export.2019-09-03.json.gz -zcat container_export.2019-09-03.json.gz > container_export.2019-09-03.json +wget -c https://archive.org/download/fatcat_bulk_exports_2019-12-13/container_export.json.gz +zcat container_export.json.gz > container_export.2019-12-13.json |