about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-25 11:45:14 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-25 11:45:14 -0700
commit: 8b473ecef2287acae2b466269b6b091eaa064bf2 (patch)
tree: 3653406e37048efd5297338a55f62fc4784785af
parent: 8c5d4ee034c8aee42a87e33234cb9327892f5d50 (diff)
download: fatcat-covid19-8b473ecef2287acae2b466269b6b091eaa064bf2.tar.gz
download: fatcat-covid19-8b473ecef2287acae2b466269b6b091eaa064bf2.zip
improvements/fixes to Makefile
-rw-r--r--  Makefile  15
1 files changed, 7 insertions, 8 deletions
diff --git a/Makefile b/Makefile
index 45a8213..e4c237f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
-TODAY := $(shell date --iso --utc)
-CORDDATE := $(TODAY)
+TODAY ?= $(shell date --iso --utc)
+CORDDATE ?= $(TODAY)
SHELL = /bin/bash
.SHELLFLAGS = -o pipefail -c
@@ -26,7 +26,7 @@ update-i18n: ## Re-extract and re-compile translation files
metadata/$(CORDDATE)/cord19.csv:
mkdir -p metadata/$(CORDDATE)
@#wget -c "https://archive.org/download/s2-cord19-dataset/cord19.$(CORDDATE).csv" -O /tmp/cord19.$(CORDDATE).csv
- wget -c "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv" -O /tmp/cord19.$(CORDDATE).csv
+ wget -c "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/$(CORDDATE)/metadata.csv" -O /tmp/cord19.$(CORDDATE).csv
mv /tmp/cord19.$(CORDDATE).csv $@
metadata/$(CORDDATE)/cord19.json: metadata/$(CORDDATE)/cord19.csv
@@ -43,7 +43,7 @@ metadata/$(CORDDATE)/cord19.missing.json: metadata/$(CORDDATE)/cord19.enrich.jso
metadata/$(TODAY)/fatcat_hits.enrich.json:
mkdir -p metadata/$(TODAY)
- pipenv run ./covid19_tool.py query-fatcat | pv -l > $@
+ pipenv run ./covid19_tool.py query-fatcat | pv -l > $@.wip
mv $@.wip $@
metadata/$(TODAY)/combined.enrich.json: metadata/$(CORDDATE)/cord19.enrich.json metadata/$(TODAY)/fatcat_hits.enrich.json
@@ -59,10 +59,9 @@ metadata/$(TODAY)/derivatives.stamp: metadata/$(TODAY)/fatcat_web.log
touch $@
metadata/$(TODAY)/combined.fulltext.json: metadata/$(TODAY)/derivatives.stamp metadata/$(TODAY)/combined.enrich.json
- ./covid19_tool.py enrich-derivatives metadata/$(TODAY)/combined.enrich.json --base-dir fulltext_web/ | pv -l > $@.wip
+ pipenv run ./covid19_tool.py enrich-derivatives metadata/$(TODAY)/combined.enrich.json --base-dir fulltext_web/ | pv -l > $@.wip
mv $@.wip $@
-
.PHONY: corpus
corpus: metadata/$(TODAY)/combined.fulltext.json ## Run ingest, resulting in combined fulltext JSON corpus on disk
@echo "Successfully built corpus for date (UTC): $(TODAY)"
@@ -73,8 +72,8 @@ create-es-index:
.PHONY: load-es
load-es: metadata/$(TODAY)/combined.fulltext.json ## Push current corpus into elasticsearch index
- pipenv run ./covid19_tool.py metadata/$(TODAY)/combined.fulltext.json | pv -l | esbulk -verbose -size 1000 -id fatcat_ident -w 8 -index covid19_fatcat_fulltext -type release
+ pipenv run ./covid19_tool.py transform-es metadata/$(TODAY)/combined.fulltext.json | pv -l | esbulk -verbose -size 1000 -id fatcat_ident -w 8 -index covid19_fatcat_fulltext -type release
.PHONY: daily-update
-daily-update: load-elasticsearch ## Command to run every day: fetch corpus, load to elasticsearch
+daily-update: load-es ## Command to run every day: fetch corpus, load to elasticsearch