1 files changed, 46 insertions, 5 deletions
diff --git a/extra/fixups/fixup_longtail_issnl_unique.py b/extra/fixups/fixup_longtail_issnl_unique.py
index 2493a332..385b0e29 100755
--- a/extra/fixups/fixup_longtail_issnl_unique.py
+++ b/extra/fixups/fixup_longtail_issnl_unique.py
@@ -15,7 +15,38 @@ See also:
 - https://archive.org/details/OA-JOURNAL-TESTCRAWL-TWO-2018-extra
 = https://archive.org/download/ia_longtail_dumpgrobidmetainsertable_2018-09-23/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz
 
-
+QA notes:
+
+- everything on revistas.uv.mx linked to 2395-9495, which is only one journal
+  on that domain. blacklist 'revistas' in the domain?
+- afjg3yjdjbf2dad47t5jq7nlbe => 2305-7254 ok match but not perfect (wrong year
+  of conference). probably better than nothing. 
+- elib.mi.sanu.ac.rs has several journals on domain. container_name was correct.
+- revistavirtual.ucn.edu.co has 2x journals
+- lpchkxkp5jecdgrab33fxodd7y bad match
+- k36web33jvf25by64gop4yil7q an IR, not a journal (ok)
+- hvxercwasjhotpewb5xfadyyle good match, though only an abstract (in URL). full
+  articles get DOIs
+- release_epkiok6y3zhsnp3no2lkljznza not a paper; journal match batch (cato, wtf)
+- release_b3jolh25mbg4djrqotgosyeike jfr.unibo.it good
+- release_bzr35evb4bdd3mxex6gxn6dcyy conf.ostis.net good?
+- uzspace.uzulu.ac.za IR, not a container
+- release_5lt36yy3vre2nnig46toy67kdi wrong, multiple journals
+- release_54hmv5gvtjghjk7rpcbp2pn2ky good
+- release_6h7doxfaxnao3jm7f6jkfdpdwm good
+- release_6pio5hz6bvawfnodhkvmfk4jei correct but stub
+- release_7oobqygqczapbgdvvgbxfyvqli correct
+- release_tsljmbevpzfpxiezzv7puwbilq good
+
+general notes:
+- GROBID works pretty well. references look pretty good, should match. there is
+  a non-trivial fraction of non-journal content, but it isn't too bad
+- this "single-journal domain" premise doesn't work
+- could probably do a subset based on "is the journal name in the domain name",
+  or "is domain acronym of journal name"
+- surprising number of IRs with ISSNs in here
+- might have better luck blacklisting out latin american TLDs, which tend to
+  host many journals?
 """
 
 import os, sys, argparse
@@ -101,16 +132,23 @@ class LongtailIssnlSingleDomainFixup(EntityImporter):
         url = cdx_dict['url']
         domain = url.split('/')[2].lower()
 
+        if not domain:
+            self.counts['skip-domain-blank'] += 1
+            return None
+
         # domain in scope?
         issnl = self._domain_issnl_map.get(domain)
         if not issnl:
             self.counts['skip-domain-scope'] += 1
             return None
+        if 'revistas' in domain.lower().split('.'):
+            self.counts['skip-domain-revistas'] += 1
+            return None
 
         # lookup file
         #print(sha1)
         try:
-            file_entity = self.api.lookup_file(sha1=sha1)
+            file_entity = self.api.lookup_file(sha1=sha1, expand="releases")
         except fatcat_client.rest.ApiException as err:
             if err.status == 404:
                 self.counts['skip-file-not-found'] += 1
@@ -136,12 +174,15 @@ class LongtailIssnlSingleDomainFixup(EntityImporter):
             return None
 
         # fetch releases
-        releases = self.api.get_file_releases(file_entity.ident)
-        releases = [r for r in releases if (r.extra.get('longtail-oa') == True and r.container_id == None)]
+        releases = [r for r in file_entity.releases if (r.extra.get('longtail_oa') == True and r.container_id == None)]
         if not releases:
+            #print(file_entity.releases)
             self.counts['skip-no-releases'] += 1
             return None
 
+        # fetch full release objects (need abstract, etc, for updating)
+        releases = [self.api.get_release(r.ident) for r in releases]
+
         # set container_id
         for r in releases:
             r.container_id = container_id
@@ -149,7 +190,7 @@ class LongtailIssnlSingleDomainFixup(EntityImporter):
 
     def try_update(self, re_list):
         for re in re_list:
-            self.api.update_release(re.ident, re, editgroup_id=self.get_editgroup_id())
+            self.api.update_release(self.get_editgroup_id(), re.ident, re)
             self.counts['update'] += 1
         return False