package skate import ( "fmt" "regexp" "strconv" "strings" "sync" "git.archive.org/martin/cgraph/skate/isbn" "git.archive.org/martin/cgraph/skate/set" ) var ( isbn10Regex = regexp.MustCompile(`[O0-9xX -]{10,18}`) isbn13Regex = regexp.MustCompile(`9[O0-9xX -]{12,20}`) rune16pool = sync.Pool{ New: func() interface{} { return make([]rune, 0, 16) }, } setPool = sync.Pool{ New: func() interface{} { return set.New() }, } ) // RefToRelease converts a ref to a release. Set a extra.skate.status flag to // be able to distinguish converted entities later. func RefToRelease(ref *Ref) (*Release, error) { var ( release Release b = ref.Biblio contribs = make([]struct { Index int `json:"index,omitempty"` RawName string `json:"raw_name,omitempty"` Role string `json:"role,omitempty"` }, len(b.ContribRawNames)) ) release.Ident = ref.ReleaseIdent release.WorkID = ref.WorkIdent release.ExtIDs.Arxiv = b.ArxivId release.ExtIDs.DOI = b.DOI release.ExtIDs.PMID = b.PMID release.ExtIDs.PMCID = b.PMCID release.Title = b.Title release.Publisher = b.Publisher release.ContainerName = b.ContainerName release.Volume = b.Volume release.Issue = b.Issue release.Pages = b.Pages if ref.ReleaseYear > 1000 { release.ReleaseYearValue = fmt.Sprintf("%d", ref.ReleaseYear) } for i, name := range b.ContribRawNames { contribs[i].Index = i contribs[i].RawName = name } release.Contribs = contribs if strings.Contains(strings.ToLower(ref.Biblio.Unstructured), "isbn") { release.ExtIDs.ISBN = parseIsbn(ref.Biblio.Unstructured) } return &release, nil } // parseIsbn tries to find and validate ISBN from unstructured data. func parseIsbn(s string) []string { // ISBN: 10: 0137822693, pp: 373 // Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec, // ISBN9652910333. Shannon, C. (1948). A Mathematical Theory of // Communication. The Bell System Technical Journal. July; October, // Vol. 27, pp. 379-423; 623-656. // Artech House, ISBN: 978-1-60807-201-9, 2011. // ... 
var ( candidates10 = isbn10Regex.FindAllString(s, -1) candidates13 = isbn13Regex.FindAllString(s, -1) u []rune z string ) valid := setPool.Get().(set.Set) valid.Clear() defer setPool.Put(valid) for _, v := range append(candidates10, candidates13...) { u = rune16pool.Get().([]rune) u = u[:0] for _, c := range v { if c == 'O' { c = '0' } if c >= '0' && c <= '9' || c == 'x' || c == 'X' { u = append(u, c) } } z = string(u) rune16pool.Put(u) if !isbn.Validate(z) { continue } if len(z) < 12 { w, err := isbn.To13(z) if err != nil { continue } valid.Add(w) } else { valid.Add(z) } } return valid.Slice() } // Ref is a reference document, can be very partial. type Ref struct { Biblio struct { ArxivId string `json:"arxiv_id,omitempty"` ContainerName string `json:"container_name,omitempty"` ContribRawNames []string `json:"contrib_raw_names,omitempty"` DOI string `json:"doi,omitempty"` Issue string `json:"issue,omitempty"` PMCID string `json:"pmcid,omitempty"` PMID string `json:"pmid,omitempty"` Pages string `json:"pages,omitempty"` Publisher string `json:"publisher,omitempty"` Title string `json:"title,omitempty"` Unstructured string `json:"unstructured,omitempty"` Url string `json:"url,omitempty"` Volume string `json:"volume,omitempty"` Year int64 `json:"year,omitempty"` } `json:"biblio"` Index int64 `json:"index,omitempty"` Key string `json:"key,omitempty"` RefSource string `json:"ref_source,omitempty"` ReleaseYear int `json:"release_year,omitempty"` ReleaseIdent string `json:"release_ident,omitempty"` ReleaseStage string `json:"release_stage,omitempty"` WorkIdent string `json:"work_ident,omitempty"` } // Release document. Note that we may have varying types for some fields. // Mitigation for now is to make the field an interface{}, name the field // "...Value" and to add a method with the field name, doing type assertion. // Example: ReleaseYearValue interface{}, ReleaseYear() int, etc. 
type Release struct {
	ContainerID   string `json:"container_id,omitempty"`
	ContainerName string `json:"container_name,omitempty"`
	Contribs      []struct {
		Index   int    `json:"index,omitempty"`
		RawName string `json:"raw_name,omitempty"`
		Role    string `json:"role,omitempty"`
	} `json:"contribs,omitempty"`
	ExtIDs struct {
		Arxiv       string   `json:"arxiv,omitempty"`
		Core        string   `json:"core,omitempty"`
		DOI         string   `json:"doi,omitempty"`
		ISBN        []string `json:"isbn,omitempty"` // should be isbn13
		Jstor       string   `json:"jstor,omitempty"`
		OLID        string   `json:"olid,omitempty"`
		PMCID       string   `json:"pmcid,omitempty"`
		PMID        string   `json:"pmid,omitempty"`
		WikidataQID string   `json:"wikidata_qid,omitempty"`
	} `json:"ext_ids,omitempty"`
	Ident     string `json:"ident,omitempty"`
	Publisher string `json:"publisher,omitempty"`
	Refs      []struct {
		ContainerName string `json:"container_name,omitempty"`
		Extra         struct {
			DOI     string   `json:"doi,omitempty"`
			Authors []string `json:"authors,omitempty"`
			Key     string   `json:"key,omitempty"`
			Year    string   `json:"year,omitempty"`
			Locator string   `json:"locator,omitempty"`
			Volume  string   `json:"volume,omitempty"`
		} `json:"extra"`
		Index   int64  `json:"index,omitempty"`
		Key     string `json:"key,omitempty"`
		Locator string `json:"locator,omitempty"`
		Year    int64  `json:"year,omitempty"`
	} `json:"refs,omitempty"`
	ReleaseDate      string      `json:"release_date,omitempty"`
	ReleaseYearValue interface{} `json:"release_year,omitempty"` // might be int or str
	ReleaseStage     string      `json:"release_stage,omitempty"`
	ReleaseType      string      `json:"release_type,omitempty"`
	Issue            string      `json:"issue,omitempty"`
	Volume           string      `json:"volume,omitempty"`
	Pages            string      `json:"pages,omitempty"`
	Title            string      `json:"title,omitempty"`
	WorkID           string      `json:"work_id,omitempty"`
	Extra            struct {
		ContainerName string      `json:"container_name,omitempty"`
		SubtitleValue interface{} `json:"subtitle,omitempty"` // []str or str
		Crossref      struct {
			Type string `json:"type,omitempty"`
		} `json:"crossref,omitempty"`
		DataCite struct {
			MetadataVersion int                `json:"metadataVersion,omitempty"`
			Relations       []DataCiteRelation `json:"relations,omitempty"`
		} `json:"datacite,omitempty"`
		Skate struct {
			// Mark as converted (e.g. by setting status to "ref")
			Status string `json:"status,omitempty"`
			// Carry the ref index and key around.
			Ref struct {
				Index   int64  `json:"index,omitempty"`
				Key     string `json:"key,omitempty"`
				Locator string `json:"locator,omitempty"`
			} `json:"ref,omitempty"`
			ResearchGate struct {
				URL string `json:"url,omitempty"`
			} `json:"rg,omitempty"`
		} `json:"skate,omitempty"`
		OpenLibrary struct {
			HasFulltext bool `json:"has_fulltext,omitempty"`
		} `json:"ol,omitempty"`
	} `json:"extra,omitempty"`
}

// Subtitle returns a slice of subtitle strings. A single string becomes a
// one-element slice, a []interface{} (as produced by JSON decoding) is
// stringified element by element, and any other type yields an empty slice.
func (r *Release) Subtitle() (result []string) {
	switch v := r.Extra.SubtitleValue.(type) {
	case []interface{}:
		for _, e := range v {
			result = append(result, fmt.Sprintf("%v", e))
		}
		return result
	case []string:
		return v
	case string:
		return []string{v}
	}
	return []string{}
}

// ReleaseYearString returns release year as string, "0" if the year is
// missing or cannot be interpreted.
func (r *Release) ReleaseYearString() string {
	return strconv.Itoa(r.ReleaseYear())
}

// ReleaseYear returns year as int, no further validity checks. Supported
// dynamic types are int, int64, float64 (JSON numbers) and string; anything
// else, including an unparsable string, results in 0.
func (r *Release) ReleaseYear() int {
	switch v := r.ReleaseYearValue.(type) {
	case int:
		return v
	case int64:
		// Values copied from int64 fields in Go code, without a JSON round trip.
		return int(v)
	case float64:
		return int(v)
	case string:
		w, err := strconv.Atoi(v)
		if err != nil {
			return 0
		}
		return w
	default:
		return 0
	}
}

// DataCiteRelation as it appears in the release extra field.
type DataCiteRelation struct {
	RelatedIdentifierType  string      `json:"relatedIdentifierType,omitempty"`
	RelatedIdentifierValue interface{} `json:"relatedIdentifier,omitempty"`
}

// RelatedIdentifier returns the identifier as string. A missing (nil) value
// yields the empty string; numeric values are rendered in plain decimal
// notation, never with an exponent.
func (r *DataCiteRelation) RelatedIdentifier() string {
	switch v := r.RelatedIdentifierValue.(type) {
	case nil:
		return ""
	case string:
		return v
	case float64:
		// JSON numbers decode to float64; "%v" would turn 12345678 into
		// "1.2345678e+07", which is useless as an identifier.
		return strconv.FormatFloat(v, 'f', -1, 64)
	default:
		return fmt.Sprintf("%v", v)
	}
}

// Sitemap basic JSON style, e.g. for https://archive.org/details/rg_sitemap_2021_02_23.
type Sitemap struct {
	Lastmod string `json:"lastmod,omitempty"`
	Title   string `json:"title,omitempty"`
	URL     string `json:"url,omitempty"`
}

// BiblioRef as a prototype for indexing, https://is.gd/yicTom. The fields
// fall into groups by prefix: Source* (presumably the citing side), Ref* (the
// reference entry), Target* (presumably the cited side) and Match* (how the
// link was established) - NOTE(review): group semantics are inferred from the
// field names, confirm against the indexing schema.
type BiblioRef struct {
	// Key is serialized as "_id".
	Key                    string `json:"_id,omitempty"`
	IndexedTs              string `json:"indexed_ts,omitempty"` // https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html
	SourceReleaseIdent     string `json:"source_release_ident,omitempty"`
	SourceWorkIdent        string `json:"source_work_ident,omitempty"`
	SourceWikipediaArticle string `json:"source_wikipedia_article,omitempty"`
	SourceReleaseStage     string `json:"source_release_stage,omitempty"`
	SourceYear             string `json:"source_year,omitempty"`
	RefIndex               int64  `json:"ref_index,omitempty"` // 1-based
	RefKey                 string `json:"ref_key,omitempty"`
	RefLocator             string `json:"ref_locator,omitempty"`
	TargetReleaseIdent     string `json:"target_release_ident,omitempty"`
	TargetWorkIdent        string `json:"target_work_ident,omitempty"`
	TargetOpenLibraryWork  string `json:"target_openlibrary_work,omitempty"`
	TargetURLSurt          string `json:"target_url_surt,omitempty"`
	TargetURL              string `json:"target_url,omitempty"`
	MatchProvenance        string `json:"match_provenance,omitempty"`
	MatchStatus            string `json:"match_status,omitempty"`
	MatchReason            string `json:"match_reason,omitempty"`
	TargetUnstructured     string `json:"target_unstructured,omitempty"`
	TargetCSL              string `json:"target_csl,omitempty"`
}

// ReleaseCluster, a list of match candidates. This is typically serialized as a
// single JSON line, using the short keys "k" (cluster key) and "v" (values).
type ReleaseCluster struct {
	Key    string     `json:"k"`
	Values []*Release `json:"v"`
}

// OneNonRef returns the first non-reference release found in a cluster, or an
// error, if none has been found. This depends on converted references setting
// extra.skate.status to "ref" - we use this in mixed clusters (catalog entries
// and references converted into releases).
func (rc *ReleaseCluster) OneNonRef() (*Release, error) { for _, re := range rc.Values { if re.Extra.Skate.Status != "ref" { return re, nil } } return nil, fmt.Errorf("no reference/release found for cluster key: %v", rc.Key) } // MinimalCitations variant from archive.org/details/wikipedia_citations_2020-07-14. type MinimalCitations struct { IDList string `json:"ID_list"` PageTitle string `json:"page_title"` Title string `json:"Title"` TypeOfCitation string `json:"type_of_citation"` } // IDList with commonly used identifier from wikipedia citations. type IDList struct { ISBN string `json:"isbn,omitempty"` DOI string `json:"doi,omitempty"` PMID string `json:"pmid,omitempty"` ISSN string `json:"issn,omitempty"` JSTOR string `json:"jstor,omitempty"` PMC string `json:"pmc,omitempty"` ARXIV string `json:"arxiv,omitempty"` OL string `json:"ol,omitempty"` } // IsZero returns true, if none of the identifiers is defined. func (l *IDList) IsZero() bool { return *l == IDList{} } // ParseIDList parses out the identifiers from a citation document, the IDList // values look something like this: "{BIBCODE=1992ApJ...399L..31C, // DOI=10.1086/186599}". func (c *MinimalCitations) ParseIDList() (result IDList) { if len(c.IDList) < 3 { return result } var ( s = c.IDList[1 : len(c.IDList)-1] // get rid of "{" and "}" parts = strings.Split(s, ",") pair []string ) for _, part := range parts { pair = strings.Split(part, "=") if len(pair) != 2 { continue } pair[0] = strings.TrimSpace(pair[0]) pair[1] = strings.TrimSpace(pair[1]) switch pair[0] { case "ISBN": result.ISBN = pair[1] case "DOI": result.DOI = pair[1] case "PMID": result.PMID = pair[1] case "ISSN": result.ISSN = pair[1] case "PMC": result.PMC = pair[1] case "JSTOR": result.JSTOR = pair[1] case "ARXIV": result.ARXIV = pair[1] case "OL": result.OL = pair[1] default: continue } } return result } // OpenLibraryWork from data dump (solr). 
type OpenLibraryWork struct { AuthorFacet []string `json:"author_facet"` AuthorKey []string `json:"author_key"` AuthorName []string `json:"author_name"` CoverEditionKey string `json:"cover_edition_key"` CoverI int64 `json:"cover_i"` EbookCountI int64 `json:"ebook_count_i"` EditionCount int64 `json:"edition_count"` EditionKey []string `json:"edition_key"` FirstPublishYear int64 `json:"first_publish_year"` HasFulltext bool `json:"has_fulltext"` IdGoodreads []string `json:"id_goodreads"` IdLibrarything []string `json:"id_librarything"` Isbn []string `json:"isbn"` Key string `json:"key"` Language []string `json:"language"` LastModifiedI int64 `json:"last_modified_i"` PublishDate []string `json:"publish_date"` PublishYear []int64 `json:"publish_year"` Publisher []string `json:"publisher"` PublisherFacet []string `json:"publisher_facet"` Seed []string `json:"seed"` Text []string `json:"text"` Title string `json:"title"` TitleSuggest []string `json:"title_suggest"` Type string `json:"type"` Version int64 `json:"_version_"` } // OpenLibraryToRelease convert OL data into a release. XXX: release/work? func OpenLibraryToRelease(w *OpenLibraryWork) (*Release, error) { var ( release Release contribs = make([]struct { Index int `json:"index,omitempty"` RawName string `json:"raw_name,omitempty"` Role string `json:"role,omitempty"` }, len(w.AuthorName)) s = set.New() ) for i, author := range w.AuthorName { contribs[i].RawName = author } release.Title = w.Title if len(w.PublishYear) > 0 { release.ReleaseYearValue = w.FirstPublishYear } for _, v := range w.Isbn { switch { case len(v) < 13: if w, err := isbn.To13(v); err == nil { s.Add(w) } default: s.Add(v) } } if len(w.Publisher) > 0 { release.Publisher = w.Publisher[0] } release.ExtIDs.ISBN = s.Slice() release.ExtIDs.OLID = strings.Replace(w.Key, "/works/", "", 1) release.Extra.OpenLibrary.HasFulltext = w.HasFulltext return &release, nil }