update script, notes

author: Martin Czygan <martin.czygan@gmail.com> 2021-06-21 20:03:45 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-06-21 20:03:45 +0200
commit: 1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b (patch)
tree: 05dad31420f1142f143209f0cfe36e2e8249f8a0
parent: e82983633b58ead6ed2ce82e36a17af227d5f5ed (diff)
download: refcat-1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b.tar.gz
refcat-1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b.zip
3 files changed, 29 insertions, 9 deletions
diff --git a/extra/cdx/cdx_query.py b/extra/cdx/cdx_query.py
index 9b7a1f0..b41ae53 100644
--- a/extra/cdx/cdx_query.py
+++ b/extra/cdx/cdx_query.py
@@ -9,31 +9,29 @@ import requests
 import sys
 
 Cdx = collections.namedtuple("Cdx", "surt date url mime status hash size")
+CdxApi = "http://web.archive.org/cdx/search/cdx"
+okStatus = ('200', '301', '302', '303')
 
 
 def parse_cdx_lines(blob):
     result = []
     lines = blob.split("\n")
-    cutoff = ["access", "Access", "abgeruf", "aufgeruf"]
     for line in lines:
-        for c in cutoff:
-            if c in line:
-                line = line[:line.index(c)]
-                break
         fields = line.strip().split()
         if len(fields) == 0:
             continue
         cdx = Cdx(*fields)
         result.append(cdx)
     result = sorted(result, key=lambda cdx: cdx.date, reverse=True)
-    result = list(filter(lambda cdx: cdx.status in ("200", "301", "302", "303"), result))
+    result = list(filter(lambda cdx: cdx.status in okStatus, result))
     return result
 
+
 def main():
     stats = collections.Counter()
     for line in fileinput.input():
         line = line.strip()
-        r = requests.get("http://web.archive.org/cdx/search/cdx?url={}".format(line))
+        r = requests.get("{}?url={}".format(CdxApi, line))
         if not r.ok:
             continue
         try:
@@ -49,9 +47,9 @@ def main():
             stats["ok"] += 1
             print("OK\t{}\t{}".format(cdx_lines[0].date, line))
 
-
     return stats
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     stats = main()
     print(stats)
diff --git a/python/notes/version_4.md b/python/notes/version_4.md
index e504b2a..2e273f8 100644
--- a/python/notes/version_4.md
+++ b/python/notes/version_4.md
@@ -821,3 +821,23 @@ all duplicates, e.g. when the indices are different, but the reference is
 actually the same.
 
 Would need a "uniq" tool for the whole ref blob or something like that.
+
+----
+
+## QA: duplicates
+
+There seem to be many self-links in the dataset:
+
+* sample: 25668733, duplicate rows: 1913155 (about 7.5%), although only 145030 of these are unique (many repetitions)
+
+```
+$ LC_ALL=C awk '$1 == $2' bref_tabs.tsv # ....
+56fbxcue6rdxlmxqto7vibg2xi      56fbxcue6rdxlmxqto7vibg2xi      exact   doi     crossref
+o2juqzskxzdtpbait5gxg3yf4q      o2juqzskxzdtpbait5gxg3yf4q      exact   doi     crossref
+6mwdlhvbljgtdntz5qifywhsn4      6mwdlhvbljgtdntz5qifywhsn4      exact   doi     crossref
+t7vluqxmgbe4pipf4nkfcayedq      t7vluqxmgbe4pipf4nkfcayedq      exact   doi     crossref
+iofm6brptvczlnrys5vxw34x3i      iofm6brptvczlnrys5vxw34x3i      exact   doi     crossref
+soa44abzivcnfnsx4ymxvbyg44      soa44abzivcnfnsx4ymxvbyg44      exact   doi     crossref
+7fs4c3u2ofcmxie344o5e4wuxi      7fs4c3u2ofcmxie344o5e4wuxi      exact   doi     crossref
+igyewr6er5epfozhk7dyfqa5tu      igyewr6er5epfozhk7dyfqa5tu      exact   doi     crossref
+```
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index bf479f0..78ac68c 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1237,6 +1237,8 @@ class Bref(Refcat):
 class BrefSortedByWorkID(Refcat):
     """
     Sort by work id. 237m45.094s.
+
+    Final file currently has: 915168340 docs.
     """
     def requires(self):
         return Bref()
author	Martin Czygan <martin.czygan@gmail.com>	2021-06-21 20:03:45 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-06-21 20:03:45 +0200
commit	1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b (patch)
tree	05dad31420f1142f143209f0cfe36e2e8249f8a0
parent	e82983633b58ead6ed2ce82e36a17af227d5f5ed (diff)
download	refcat-1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b.tar.gz refcat-1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b.zip