extra/onix_xml_to_csv.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

#!/usr/bin/env python3

"""
This is a helper script to convert ONIX XML (e.g., from Scholar's Portal) to ONIX CSV format.

Rough XML schema:

    oph:HoldingsList
      oph:HoldingsRecord (many)
        oph:NotificationType text "00" (?)
        oph:ResourceVersion
            oph:ResourceVersionIdentifier
                oph:ResourceVersionIDType "07" is ISSN?
                oph:IDValue
            oph:Title
                oph:TitleText journal title
            oph:Publisher
                oph:PublisherName publisher name
            oph:OnlinePackage
                oph:PackageDetail (multiple)
                    oph:Coverage
                        oph:CoverageDescriptionLevel "03"
                        oph:FixedCoverage
                            oph:Release (multiple)
                                oph:Enumeration
                                    oph:Level1
                                        oph:Unit "Volume"
                                        oph:Number volume number
                                    oph:Level2
                                        oph:Unit "Issue"
                                        oph:Number issue number
                                    oph:NominalDate
                                        oph:DateFormat "01"
                                        oph:Date partial date, eg "197803"
    
                    oph:PreservationStatus
                        oph:PreservationStatusCode "05"
                    oph:VerificationStatus "01"

ONIX CSV columns:

    ISSN
    Title
    Publisher
    Url
    Vol
    No
    Published
    Deposited
"""

import sys
import csv
import argparse
import xml.etree.ElementTree as ET
from typing import List, Any, Dict, AnyStr, Optional


# Namespace URI in Clark notation ("{uri}"); prepended to a local name to build
# the fully-qualified tag compared against Element.tag while streaming.
oph = "{http://www.editeur.org/onix/serials/SOH}"
# Prefix-to-URI map passed to find()/findall() so paths can use the "oph:" prefix.
xml_ns = {
    "oph": "http://www.editeur.org/onix/serials/SOH",
}

def fix_issn(raw: Optional[str]) -> Optional[str]:
    """Normalize an ISSN to the hyphenated ``NNNN-NNNN`` layout.

    Args:
        raw: ISSN text as read from the XML. May be ``None`` or empty when
            the source element carries no text.

    Returns:
        ``raw`` unchanged when it is 9 characters long (assumed to be
        hyphenated already; the content itself is not validated), the
        hyphenated form when it is 8 characters long, and ``None`` for
        anything else, including missing input.
    """
    if not raw:
        # Element.text is None for an empty element; len(None) would raise.
        return None
    if len(raw) == 9:
        return raw
    if len(raw) == 8:
        return f"{raw[:4]}-{raw[4:8]}"
    return None

def fix_date(raw: str) -> str:
    """Turn a compact ``YYYYMM`` date into ``YYYY-MM``.

    Any value that is not exactly six digits is handed back untouched.
    """
    is_compact_year_month = len(raw) == 6 and raw.isdigit()
    if not is_compact_year_month:
        return raw
    year, month = raw[:4], raw[4:6]
    return f"{year}-{month}"

def fix_str(raw: Optional[str]) -> Optional[str]:
    """Tidy a text value for CSV output.

    Surrounding whitespace is stripped and each embedded newline becomes a
    single space. ``None`` and the empty string both yield ``None``.
    """
    if raw:
        trimmed = raw.strip()
        return " ".join(trimmed.split('\n'))
    return None

def _find_text(parent, path: str) -> Optional[str]:
    """Return the text of the first element matching *path*, or None if absent."""
    node = parent.find(path, xml_ns)
    return None if node is None else node.text

def resource_to_rows(resource) -> List[dict]:
    """Flatten one ``oph:ResourceVersion`` element into CSV row dicts.

    One row is produced per ``oph:Release`` that has a Level1 unit of
    "Volume", a Level2 unit of "Issue" and a nominal date, inside a
    ``oph:PackageDetail`` whose coverage description level is "03",
    verification status is "01" and preservation status code is "05".

    Missing elements are treated as non-matching (the resource, package or
    release is skipped) rather than raising ``AttributeError``. A missing
    publisher, volume number or issue number yields ``None`` in the row.

    Args:
        resource: an ElementTree element for ``oph:ResourceVersion``.

    Returns:
        A list of dicts with the keys ISSN, Title, Publisher, Url, Vol, No,
        Published and Deposited. The list is empty when the identifier type
        is not "07" or the title is missing or empty.
    """
    id_type = _find_text(resource, 'oph:ResourceVersionIdentifier/oph:ResourceVersionIDType')
    if id_type != "07":
        return []
    title = fix_str(_find_text(resource, 'oph:Title/oph:TitleText'))
    if not title:
        return []
    issn_raw = _find_text(resource, 'oph:ResourceVersionIdentifier/oph:IDValue')
    base = {
        # Guard here so a missing/empty IDValue never reaches fix_issn as None.
        'ISSN': fix_issn(issn_raw) if issn_raw else None,
        'Title': title,
        'Publisher': fix_str(_find_text(resource, 'oph:Publisher/oph:PublisherName')),
        'Url': '',
        'Deposited': '',
    }

    rows = []
    for package in resource.findall('oph:OnlinePackage/oph:PackageDetail', xml_ns):
        if _find_text(package, 'oph:Coverage/oph:CoverageDescriptionLevel') != "03":
            continue
        if _find_text(package, 'oph:VerificationStatus') != "01":
            continue
        if _find_text(package, 'oph:PreservationStatus/oph:PreservationStatusCode') != "05":
            continue
        for release in package.findall('oph:Coverage/oph:FixedCoverage/oph:Release', xml_ns):
            if _find_text(release, 'oph:Enumeration/oph:Level1/oph:Unit') != "Volume":
                continue
            if _find_text(release, 'oph:Enumeration/oph:Level2/oph:Unit') != "Issue":
                continue
            # Look the date up once; skip releases with no (or empty) date.
            published = _find_text(release, 'oph:NominalDate/oph:Date')
            if published is None:
                continue
            rows.append({
                'Vol': _find_text(release, 'oph:Enumeration/oph:Level1/oph:Number'),
                'No': _find_text(release, 'oph:Enumeration/oph:Level2/oph:Number'),
                'Published': fix_date(published),
                **base,
            })
    return rows

def onix_xml_to_csv(xml_input_file, csv_output_file):
    """Stream an ONIX XML holdings file and write its releases as CSV.

    The input is parsed incrementally with ``ET.iterparse``. Each completed
    ``oph:ResourceVersion`` element is converted with ``resource_to_rows``,
    written out, and then cleared so memory use stays bounded.

    Args:
        xml_input_file: filename or file object handed to ``ET.iterparse``.
        csv_output_file: writable text file object handed to
            ``csv.DictWriter``; a header row is always written first.
    """
    elem_iter = ET.iterparse(xml_input_file, ["start", "end"])

    fieldnames = ['ISSN', 'Title', 'Publisher', 'Url', 'Vol', 'No', 'Published', 'Deposited']
    writer = csv.DictWriter(csv_output_file, fieldnames=fieldnames)
    writer.writeheader()

    resource_tag = f"{oph}ResourceVersion"
    root = None
    for (event, element) in elem_iter:
        if event == "start":
            # Capture the document root exactly once. Compare against None:
            # an Element with no children is falsy (and truth-testing it is
            # deprecated), so `not root` would rebind root after every clear().
            if root is None:
                root = element
            continue
        if element.tag != resource_tag:
            continue
        for row in resource_to_rows(element):
            writer.writerow(row)
        # Drop the processed subtree and everything accumulated under the root.
        element.clear()
        root.clear()

def main() -> None:  # pragma no cover
    """Command-line entry point: convert an ONIX XML file to CSV.

    Takes two positional arguments, the XML input path and the CSV output
    path, and passes the opened files to ``onix_xml_to_csv``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="ONIX XML to CSV",
        usage="%(prog)s <xml_input_file> <csv_output_file>",
    )
    # Open the XML as bytes so the parser applies the encoding declared in the
    # document instead of text-mode decoding with the locale's default.
    parser.add_argument("xml_input_file", type=argparse.FileType('rb'))
    parser.add_argument("csv_output_file", type=argparse.FileType('w'))

    args = parser.parse_args()

    onix_xml_to_csv(args.xml_input_file, args.csv_output_file)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":  # pragma no cover
    main()