From 66ff1990d81bfc461c4cbbcc46278f785c4f273c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 23 Jun 2020 23:11:47 -0700 Subject: add MAG importer; reorder directory class listing --- chocula/directories/__init__.py | 23 +++++++++------- chocula/directories/mag.py | 60 +++++++++++++++++++++++++++++++++++++++++ sources.toml | 5 ++++ tests/files/ISSN-to-ISSN-L.txt | 7 +++++ tests/files/mag-journals.txt | 25 +++++++++++++++++ 5 files changed, 110 insertions(+), 10 deletions(-) create mode 100644 chocula/directories/mag.py create mode 100644 tests/files/mag-journals.txt diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py index 17329e1..ed306c0 100644 --- a/chocula/directories/__init__.py +++ b/chocula/directories/__init__.py @@ -18,26 +18,29 @@ from chocula.directories.vanished_inactive import VanishedInactiveLoader from chocula.directories.issn_meta import IssnMetaLoader from chocula.directories.australian_era import AustralianEraLoader from chocula.directories.awol import AwolLoader +from chocula.directories.mag import MagLoader +# sort order roughly results in metadata prioritization ALL_CHOCULA_DIR_CLASSES = [ - CrossrefLoader, + IssnMetaLoader, + ManualHomepageLoader, + ScieloLoader, DoajLoader, + CrossrefLoader, EntrezLoader, EzbLoader, GoldOALoader, NorwegianLoader, + AustralianEraLoader, + SzczepanskiLoader, + WikidataLoader, + AwolLoader, + VanishedDisapearedLoader, + VanishedInactiveLoader, OpenAPCLoader, RoadLoader, SherpaRomeoLoader, - SzczepanskiLoader, - WikidataLoader, SimLoader, - ScieloLoader, - ManualHomepageLoader, ZdbFizeLoader, - VanishedDisapearedLoader, - VanishedInactiveLoader, - IssnMetaLoader, - AustralianEraLoader, - AwolLoader, + MagLoader, ] diff --git a/chocula/directories/mag.py b/chocula/directories/mag.py new file mode 100644 index 0000000..dbbc324 --- /dev/null +++ b/chocula/directories/mag.py @@ -0,0 +1,60 @@ +from typing import Iterable, Optional +import csv + +from chocula.util import clean_str, 
class MagLoader(DirectoryLoader):
    """
    Directory loader for the Microsoft Academic Graph (MAG) journals table.

    The input is a header-less, tab-separated file with one journal per row.

    TSV Columns (from schema docs):

        1   JournalId           long        PRIMARY KEY
        2   Rank                uint        See FAQ
        3   NormalizedName      string
        4   DisplayName         string
        5   Issn                string
        6   Publisher           string
        7   Webpage             string
        8   PaperCount          long
        9   PaperFamilyCount    long        See FAQ
        10  CitationCount       long
        11  CreatedDate         DateTime

    """

    source_slug = "mag"

    def open_file(self) -> Iterable:
        """
        Yield one dict per row of the TSV at ``self.config.mag.filepath``.

        Rows are keyed by the column names listed in the class docstring.
        The file is opened lazily (on first iteration) and is closed once
        iteration finishes or the generator is discarded.
        """
        # Journal names include non-ASCII characters, so do not depend on the
        # locale's default encoding. NOTE(review): assumes the dump is UTF-8.
        # newline="" is what the csv module asks for on file objects.
        # NOTE(review): default csv quoting is kept as-is; if the dump can
        # contain bare double quotes, csv.QUOTE_NONE may be more appropriate.
        with open(
            self.config.mag.filepath, "r", encoding="utf-8", newline=""
        ) as tsv_file:
            yield from csv.DictReader(
                tsv_file,
                delimiter="\t",
                # the file has no header row; names are supplied in schema order
                fieldnames=[
                    "JournalId",
                    "Rank",
                    "NormalizedName",
                    "DisplayName",
                    "Issn",
                    "Publisher",
                    "Webpage",
                    "PaperCount",
                    "PaperFamilyCount",
                    "CitationCount",
                    "CreatedDate",
                ],
            )

    def parse_record(self, record) -> Optional[DirectoryInfo]:
        """
        Convert one TSV row (a dict from ``open_file``) into a DirectoryInfo.

        The ISSN, display name and publisher are passed through the cleaning
        helpers; the MAG ``JournalId`` is stored as ``custom_id``. If the
        ``Webpage`` column yields a usable HomepageUrl, it is appended to
        ``homepage_urls``.
        """
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            # DictReader fills columns missing from a short row with None;
            # normalize to "" the same way the Webpage column is handled below
            raw_issn=clean_issn(record["Issn"] or ""),
            custom_id=record["JournalId"],
            name=clean_str(record["DisplayName"]),
            publisher=clean_str(record["Publisher"]),
        )
        homepage = HomepageUrl.from_url(record["Webpage"] or "")
        if homepage:
            info.homepage_urls.append(homepage)

        return info
+1948-6553 1948-6553 +0036-4355 0036-4355 +1609-3321 1609-3321 +8756-4629 8756-4629 +1648-6897 1648-6897 +0030-9648 0030-9648 diff --git a/tests/files/mag-journals.txt b/tests/files/mag-journals.txt new file mode 100644 index 0000000..8148bba --- /dev/null +++ b/tests/files/mag-journals.txt @@ -0,0 +1,25 @@ +465895 12312 eureka Eureka 1342-5641 http://www.archim.org.uk/eureka/ 1106 168 2016-06-24 +1137746 11759 the artist and journal of home culture The Artist and Journal of Home Culture 2151-4879 518 8675 2016-06-24 +2978343 15234 cumberland law review Cumberland Law Review 0360-8298 http://www.cumberlandlawreview.com/ 47 44 2016-06-24 +3010151 14881 comparative medicine east and west Comparative Medicine East and West 0147-2917 19 122 2016-06-24 +3164724 9968 physiological measurement Physiological Measurement 0967-3334 3276 69046 2016-06-24 +17807283 9520 theoretical population biology Theoretical Population Biology 0040-5809 http://www.journals.elsevier.com/theoretical-population-biology/ 2478 107627 2016-06-24 +18204665 9332 international journal of multiphase flow International Journal of Multiphase Flow 0301-9322 4818 124251 2016-06-24 +27908409 11407 acta veterinaria hungarica Acta Veterinaria Hungarica 0236-6290 1727 11463 2016-06-24 +40275224 12875 revista latinoamericana de quimica Revista Latinoamericana de Química 0370-5943 http://www.relaquim.com/ 557 815 2016-06-24 +42124886 9771 scandinavian journal of rheumatology Scandinavian Journal of Rheumatology 0300-9742 4499 77427 2016-06-24 +2739456633 13593 derim Derim 2149-2182 308 98 2017-07-31 +2764395831 13860 the naham management journal The NAHAM management journal 1057-3526 229 25 2017-10-27 +49789000 9765 fatigue & fracture of engineering materials & structures Fatigue & Fracture of Engineering Materials & Structures 1460-2695 3983 65247 2016-06-24 +2736806674 12364 sequencia estudos juridicos e politicos Sequência: Estudos Juridicos e Politicos 0101-9562 949 240 2017-07-31 +2764944378 13733 
journal of sport psychology in action Journal of sport psychology in action 2152-0712 220 956 2017-10-27 +2789086 13552 japanese dental science review Japanese Dental Science Review 1882-7616 212 1672 2016-06-24 +26559856 15149 obesity and weight management Obesity and Weight Management 1948-6553 55 119 2016-06-24 +2755455544 11243 sangre Sangre 0036-4355 3038 2091 2017-09-25 +2764925653 12051 applied journal of general practice Applied Journal of General Practice 1465 29 2017-10-27 +28565804 11453 moscow mathematical journal Moscow Mathematical Journal 1609-3321 http://www.ams.org/distribution/mmj/ 668 8844 2016-06-24 +26007720 14383 journal of geriatric drug therapy Journal of Geriatric Drug Therapy 8756-4629 99 247 2016-06-24 +117080907 11805 journal of the korean society for library and information science Journal of The Korean Society for Library and Information Science 1631 1458 2016-06-24 +62893890 12653 journal of environmental engineering and landscape management Journal of Environmental Engineering and Landscape Management 1648-6897 526 3649 2016-06-24 +2764717979 14185 journal of basic research in medical sciences Journal of Basic Research in Medical Sciences 170 56 2017-10-27 +2739194844 12343 pakistan armed forces medical journal Pakistan Armed Forces Medical Journal 0030-9648 1088 90 2017-07-31 -- cgit v1.2.3