package skate import ( "fmt" "strconv" "strings" ) // RefToRelease converts a ref to a release. Set a extra.skate.status flag to // be able to distinguish converted entities later. func RefToRelease(ref *Ref) (*Release, error) { var ( release Release b = ref.Biblio contribs = make([]struct { Index int `json:"index,omitempty"` RawName string `json:"raw_name,omitempty"` Role string `json:"role,omitempty"` }, len(b.ContribRawNames)) ) release.Ident = ref.ReleaseIdent release.WorkID = ref.WorkIdent release.ExtIDs.Arxiv = b.ArxivId release.ExtIDs.DOI = b.DOI release.ExtIDs.PMID = b.PMID release.ExtIDs.PMCID = b.PMCID release.Title = b.Title release.Publisher = b.Publisher release.ContainerName = b.ContainerName release.Volume = b.Volume release.Issue = b.Issue release.Pages = b.Pages if ref.ReleaseYear > 1000 { release.ReleaseYearValue = fmt.Sprintf("%d", ref.ReleaseYear) } for i, name := range b.ContribRawNames { contribs[i].Index = i contribs[i].RawName = name } release.Contribs = contribs return &release, nil } // Ref is a reference document, can be very partial. 
type Ref struct {
	// Biblio holds the bibliographic fields; it is always serialized (no
	// omitempty on the "biblio" key). RefToRelease copies most of these
	// fields into a Release.
	Biblio struct {
		ArxivId       string `json:"arxiv_id,omitempty"`
		ContainerName string `json:"container_name,omitempty"`
		// ContribRawNames yields one contrib per entry in RefToRelease.
		ContribRawNames []string `json:"contrib_raw_names,omitempty"`
		DOI             string   `json:"doi,omitempty"`
		Issue           string   `json:"issue,omitempty"`
		PMCID           string   `json:"pmcid,omitempty"`
		PMID            string   `json:"pmid,omitempty"`
		Pages           string   `json:"pages,omitempty"`
		Publisher       string   `json:"publisher,omitempty"`
		Title           string   `json:"title,omitempty"`
		// Unstructured, Url and Year are not read by RefToRelease.
		Unstructured string `json:"unstructured,omitempty"`
		Url          string `json:"url,omitempty"`
		Volume       string `json:"volume,omitempty"`
		Year         int64  `json:"year,omitempty"`
	} `json:"biblio"`
	// NOTE(review): Index and Key presumably identify the reference within
	// its citing release; confirm against the data source.
	Index     int64  `json:"index,omitempty"`
	Key       string `json:"key,omitempty"`
	RefSource string `json:"ref_source,omitempty"`
	// ReleaseYear is the year RefToRelease uses for the converted release;
	// values of 1000 or less are ignored there.
	ReleaseYear int `json:"release_year,omitempty"`
	// ReleaseIdent and WorkIdent become Ident and WorkID of the converted
	// release in RefToRelease.
	ReleaseIdent string `json:"release_ident,omitempty"`
	ReleaseStage string `json:"release_stage,omitempty"`
	WorkIdent    string `json:"work_ident,omitempty"`
}

// Release document. Note that we may have varying types for some fields.
// Mitigation for now is to make the field an interface{}, name the field
// "...Value" and to add a method with the field name, doing type assertion.
// Example: ReleaseYearValue interface{}, ReleaseYear() int, etc.
type Release struct {
	ContainerID   string `json:"container_id,omitempty"`
	ContainerName string `json:"container_name,omitempty"`
	Contribs      []struct {
		Index   int    `json:"index,omitempty"`
		RawName string `json:"raw_name,omitempty"`
		Role    string `json:"role,omitempty"`
	} `json:"contribs,omitempty"`
	ExtIDs struct {
		DOI         string `json:"doi,omitempty"`
		PMID        string `json:"pmid,omitempty"`
		PMCID       string `json:"pmcid,omitempty"`
		Arxiv       string `json:"arxiv,omitempty"`
		Core        string `json:"core,omitempty"`
		WikidataQID string `json:"wikidata_qid,omitempty"`
		Jstor       string `json:"jstor,omitempty"`
		ISBN10      string `json:"isbn10,omitempty"`
		ISBN13      string `json:"isbn13,omitempty"`
	} `json:"ext_ids,omitempty"`
	Ident     string `json:"ident,omitempty"`
	Publisher string `json:"publisher,omitempty"`
	Refs      []struct {
		ContainerName string `json:"container_name,omitempty"`
		Extra         struct {
			DOI     string   `json:"doi,omitempty"`
			Authors []string `json:"authors,omitempty"`
			Key     string   `json:"key,omitempty"`
			Year    string   `json:"year,omitempty"`
			Locator string   `json:"locator,omitempty"`
			Volume  string   `json:"volume,omitempty"`
		} `json:"extra"`
		Index   int64  `json:"index,omitempty"`
		Key     string `json:"key,omitempty"`
		Locator string `json:"locator,omitempty"`
		Year    int64  `json:"year,omitempty"`
	} `json:"refs,omitempty"`
	ReleaseDate      string      `json:"release_date,omitempty"`
	ReleaseYearValue interface{} `json:"release_year,omitempty"` // might be int or str; use ReleaseYear() to read it
	ReleaseStage     string      `json:"release_stage,omitempty"`
	ReleaseType      string      `json:"release_type,omitempty"`
	Issue            string      `json:"issue,omitempty"`
	Volume           string      `json:"volume,omitempty"`
	Pages            string      `json:"pages,omitempty"`
	Title            string      `json:"title,omitempty"`
	WorkID           string      `json:"work_id,omitempty"`
	Extra            struct {
		ContainerName string      `json:"container_name,omitempty"`
		SubtitleValue interface{} `json:"subtitle,omitempty"` // []str or str; use Subtitle() to read it
		Crossref      struct {
			Type string `json:"type,omitempty"`
		} `json:"crossref,omitempty"`
		DataCite struct {
			MetadataVersion int                `json:"metadataVersion,omitempty"`
			Relations       []DataCiteRelation `json:"relations,omitempty"`
		} `json:"datacite,omitempty"`
		Skate struct {
			// Mark as converted (e.g. by setting status to "ref")
			Status string `json:"status,omitempty"`
			// Carry the ref index and key around.
			Ref struct {
				Index   int64  `json:"index,omitempty"`
				Key     string `json:"key,omitempty"`
				Locator string `json:"locator,omitempty"`
			} `json:"ref,omitempty"`
			ResearchGate struct {
				URL string `json:"url,omitempty"`
			} `json:"rg,omitempty"`
		} `json:"skate,omitempty"`
	} `json:"extra,omitempty"`
}

// Subtitle returns a slice of subtitle strings. A single string becomes a
// one-element slice, a []string is returned as is and the elements of a
// []interface{} are formatted with %v. Any other type (including a missing
// value) yields an empty slice.
func (r *Release) Subtitle() (result []string) {
	switch v := r.Extra.SubtitleValue.(type) {
	case []interface{}:
		for _, e := range v {
			result = append(result, fmt.Sprintf("%v", e))
		}
		return result
	case []string:
		return v
	case string:
		return []string{v}
	}
	return []string{}
}

// ReleaseYearString returns release year as string, "0" if the year cannot
// be determined (see ReleaseYear).
func (r *Release) ReleaseYearString() string {
	return strconv.Itoa(r.ReleaseYear())
}

// ReleaseYear returns year as int, no further validity checks. Supported
// underlying types are int, int64, float64 (truncated) and string (parsed as
// a decimal integer). Anything else, as well as an unparsable string, yields 0.
func (r *Release) ReleaseYear() int {
	switch v := r.ReleaseYearValue.(type) {
	case int:
		return v
	case int64:
		// Values set programmatically from int64 fields, e.g.
		// OpenLibraryWork.FirstPublishYear.
		return int(v)
	case float64:
		// encoding/json decodes numbers into float64 for interface{} targets.
		return int(v)
	case string:
		w, err := strconv.Atoi(v)
		if err != nil {
			return 0
		}
		return w
	default:
		return 0
	}
}

// DataCiteRelation as it appears in the release extra field.
type DataCiteRelation struct {
	RelatedIdentifierType  string      `json:"relatedIdentifierType,omitempty"`
	RelatedIdentifierValue interface{} `json:"relatedIdentifier,omitempty"`
}

// RelatedIdentifier returns the identifier as string. A missing value yields
// the empty string. Numeric values are rendered in plain decimal notation, so
// a JSON number like 12345678 comes back as "12345678" and not in exponent
// form. All other types are formatted with %v.
func (r *DataCiteRelation) RelatedIdentifier() string {
	switch v := r.RelatedIdentifierValue.(type) {
	case nil:
		return ""
	case string:
		return v
	case float64:
		// %v would switch to exponent notation for larger numbers.
		return strconv.FormatFloat(v, 'f', -1, 64)
	default:
		return fmt.Sprintf("%v", v)
	}
}

// Sitemap basic JSON style, e.g. for https://archive.org/details/rg_sitemap_2021_02_23.
type Sitemap struct {
	Lastmod string `json:"lastmod,omitempty"`
	Title   string `json:"title,omitempty"`
	URL     string `json:"url,omitempty"`
}

// BiblioRef as a prototype for indexing, https://is.gd/yicTom.
// NOTE(review): the Source*/Target* prefixes presumably denote the citing and
// the cited side of a reference; confirm against the index schema.
type BiblioRef struct {
	// Key is serialized as "_id".
	Key                    string `json:"_id,omitempty"`
	IndexedTs              string `json:"indexed_ts,omitempty"` // XXX: maybe: "epoch_millis", https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html
	SourceReleaseIdent     string `json:"source_release_ident,omitempty"`
	SourceWorkIdent        string `json:"source_work_ident,omitempty"`
	SourceWikipediaArticle string `json:"source_wikipedia_article,omitempty"`
	SourceReleaseStage     string `json:"source_release_stage,omitempty"`
	SourceYear             string `json:"source_year,omitempty"`
	RefIndex               int64  `json:"ref_index,omitempty"` // 1-based
	RefKey                 string `json:"ref_key,omitempty"`
	RefLocator             string `json:"ref_locator,omitempty"`
	TargetReleaseIdent     string `json:"target_release_ident,omitempty"`
	TargetWorkIdent        string `json:"target_work_ident,omitempty"`
	TargetOpenLibraryWork  string `json:"target_openlibrary_work,omitempty"`
	TargetURLSurt          string `json:"target_url_surt,omitempty"`
	TargetURL              string `json:"target_url,omitempty"`
	MatchProvenance        string `json:"match_provenance,omitempty"`
	MatchStatus            string `json:"match_status,omitempty"`
	MatchReason            string `json:"match_reason,omitempty"`
	TargetUnstructured     string `json:"target_unstructured,omitempty"`
	TargetCSL              string `json:"target_csl,omitempty"`
}

// ReleaseCluster, a list of match candidates. This is typically serialized as a
// single JSON line.
type ReleaseCluster struct {
	// Key is serialized under the short JSON key "k".
	Key string `json:"k"`
	// Values is serialized under the short JSON key "v".
	Values []*Release `json:"v"`
}

// OneNonRef returns the first non-reference release found in a cluster, or an
// error, if none has been found. This depends on converted references using
// the status "ref" in extra. We use this in mixed clusters (catalog entries
// and references converted into releases).
func (rc *ReleaseCluster) OneNonRef() (*Release, error) { for _, re := range rc.Values { if re.Extra.Skate.Status != "ref" { return re, nil } } return nil, fmt.Errorf("no reference/release found for cluster key: %v", rc.Key) } // MinimalCitations variant from archive.org/details/wikipedia_citations_2020-07-14. type MinimalCitations struct { IDList string `json:"ID_list"` PageTitle string `json:"page_title"` Title string `json:"Title"` TypeOfCitation string `json:"type_of_citation"` } // IDList with commonly used identifier from wikipedia citations. type IDList struct { ISBN string `json:"isbn,omitempty"` DOI string `json:"doi,omitempty"` PMID string `json:"pmid,omitempty"` ISSN string `json:"issn,omitempty"` JSTOR string `json:"jstor,omitempty"` PMC string `json:"pmc,omitempty"` ARXIV string `json:"arxiv,omitempty"` OL string `json:"ol,omitempty"` } // IsZero returns true, if none of the identifiers is defined. func (l *IDList) IsZero() bool { return *l == IDList{} } // ParseIDList parses out the identifiers from a citation document, the IDList // values look something like this: "{BIBCODE=1992ApJ...399L..31C, // DOI=10.1086/186599}". func (c *MinimalCitations) ParseIDList() (result IDList) { if len(c.IDList) < 3 { return result } var ( s = c.IDList[1 : len(c.IDList)-1] // get rid of "{" and "}" parts = strings.Split(s, ",") pair []string ) for _, part := range parts { pair = strings.Split(part, "=") if len(pair) != 2 { continue } pair[0] = strings.TrimSpace(pair[0]) pair[1] = strings.TrimSpace(pair[1]) switch pair[0] { case "ISBN": result.ISBN = pair[1] case "DOI": result.DOI = pair[1] case "PMID": result.PMID = pair[1] case "ISSN": result.ISSN = pair[1] case "PMC": result.PMC = pair[1] case "JSTOR": result.JSTOR = pair[1] case "ARXIV": result.ARXIV = pair[1] case "OL": result.OL = pair[1] default: continue } } return result } // OpenLibraryWork from data dump (solr). 
type OpenLibraryWork struct { AuthorFacet []string `json:"author_facet"` AuthorKey []string `json:"author_key"` AuthorName []string `json:"author_name"` CoverEditionKey string `json:"cover_edition_key"` CoverI int64 `json:"cover_i"` EbookCountI int64 `json:"ebook_count_i"` EditionCount int64 `json:"edition_count"` EditionKey []string `json:"edition_key"` FirstPublishYear int64 `json:"first_publish_year"` HasFulltext bool `json:"has_fulltext"` IdGoodreads []string `json:"id_goodreads"` IdLibrarything []string `json:"id_librarything"` Isbn []string `json:"isbn"` Key string `json:"key"` Language []string `json:"language"` LastModifiedI int64 `json:"last_modified_i"` PublishDate []string `json:"publish_date"` PublishYear []int64 `json:"publish_year"` Publisher []string `json:"publisher"` PublisherFacet []string `json:"publisher_facet"` Seed []string `json:"seed"` Text []string `json:"text"` Title string `json:"title"` TitleSuggest []string `json:"title_suggest"` Type string `json:"type"` Version int64 `json:"_version_"` } // OpenLibraryToRelease convert OL data into a release. XXX: release/work? func OpenLibraryToRelease(w *OpenLibraryWork) (*Release, error) { var ( release Release contribs = make([]struct { Index int `json:"index,omitempty"` RawName string `json:"raw_name,omitempty"` Role string `json:"role,omitempty"` }, len(w.AuthorName)) ) for i, author := range w.AuthorName { contribs[i].RawName = author } release.Title = w.Title if len(w.PublishYear) > 0 { release.ReleaseYearValue = w.FirstPublishYear } // XXX: isbn normalization return nil, nil }