diff options
Diffstat (limited to 'extra')
-rwxr-xr-x | extra/fixups/fixup_longtail_issnl_unique.py | 51 |
1 files changed, 46 insertions, 5 deletions
diff --git a/extra/fixups/fixup_longtail_issnl_unique.py b/extra/fixups/fixup_longtail_issnl_unique.py index 2493a332..385b0e29 100755 --- a/extra/fixups/fixup_longtail_issnl_unique.py +++ b/extra/fixups/fixup_longtail_issnl_unique.py @@ -15,7 +15,38 @@ See also: - https://archive.org/details/OA-JOURNAL-TESTCRAWL-TWO-2018-extra = https://archive.org/download/ia_longtail_dumpgrobidmetainsertable_2018-09-23/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz - +QA notes: + +- everything on revistas.uv.mx linked to 2395-9495, which is only one journal + on that domain. blacklist 'revistas' in the domain? +- afjg3yjdjbf2dad47t5jq7nlbe => 2305-7254 ok match but not perfect (wrong year + of conference). probably better than nothing. +- elib.mi.sanu.ac.rs has several journals on domain. container_name was correct. +- revistavirtual.ucn.edu.co has 2x journals +- lpchkxkp5jecdgrab33fxodd7y bad match +- k36web33jvf25by64gop4yil7q an IR, not a journal (ok) +- hvxercwasjhotpewb5xfadyyle good match, though only an abstract (in URL). full + articles get DOIs +- release_epkiok6y3zhsnp3no2lkljznza not a paper; journal match batch (cato, wtf) +- release_b3jolh25mbg4djrqotgosyeike jfr.unibo.it good +- release_bzr35evb4bdd3mxex6gxn6dcyy conf.ostis.net good? +- uzspace.uzulu.ac.za IR, not a container +- release_5lt36yy3vre2nnig46toy67kdi wrong, multiple journals +- release_54hmv5gvtjghjk7rpcbp2pn2ky good +- release_6h7doxfaxnao3jm7f6jkfdpdwm good +- release_6pio5hz6bvawfnodhkvmfk4jei correct but stub +- release_7oobqygqczapbgdvvgbxfyvqli correct +- release_tsljmbevpzfpxiezzv7puwbilq good + +general notes: +- GROBID works pretty well. references look pretty good, should match. there is + a non-trivial fraction of non-journal content, but it isn't too bad +- this "single-journal domain" premise doesn't work +- could probably do a subset based on "is the journal name in the domain name", + or "is domain acronym of journal name" +- surprising number of IRs with ISSNs in here +- might have better luck blacklisting out latin american TLDs, which tend to + host many journals? """ import os, sys, argparse @@ -101,16 +132,23 @@ class LongtailIssnlSingleDomainFixup(EntityImporter): url = cdx_dict['url'] domain = url.split('/')[2].lower() + if not domain: + self.counts['skip-domain-blank'] += 1 + return None + # domain in scope? issnl = self._domain_issnl_map.get(domain) if not issnl: self.counts['skip-domain-scope'] += 1 return None + if 'revistas' in domain.lower().split('.'): + self.counts['skip-domain-revistas'] += 1 + return None # lookup file #print(sha1) try: - file_entity = self.api.lookup_file(sha1=sha1) + file_entity = self.api.lookup_file(sha1=sha1, expand="releases") except fatcat_client.rest.ApiException as err: if err.status == 404: self.counts['skip-file-not-found'] += 1 @@ -136,12 +174,15 @@ class LongtailIssnlSingleDomainFixup(EntityImporter): return None # fetch releases - releases = self.api.get_file_releases(file_entity.ident) - releases = [r for r in releases if (r.extra.get('longtail-oa') == True and r.container_id == None)] + releases = [r for r in file_entity.releases if (r.extra.get('longtail_oa') == True and r.container_id == None)] if not releases: + #print(file_entity.releases) self.counts['skip-no-releases'] += 1 return None + # fetch full release objects (need abstract, etc, for updating) + releases = [self.api.get_release(r.ident) for r in releases] + # set container_id for r in releases: r.container_id = container_id @@ -149,7 +190,7 @@ class LongtailIssnlSingleDomainFixup(EntityImporter): def try_update(self, re_list): for re in re_list: - self.api.update_release(re.ident, re, editgroup_id=self.get_editgroup_id()) + self.api.update_release(self.get_editgroup_id(), re.ident, re) self.counts['update'] += 1 return False |