;;;; "ncbi-dna.scm" Read and manipulate NCBI-format nucleotide sequences ;;; Copyright (C) 2003 Aubrey Jaffer ; ;Permission to copy this software, to modify it, to redistribute it, ;to distribute modified versions, and to use it for any purpose is ;granted, subject to the following restrictions and understandings. ; ;1. Any copy made of this software must include this copyright notice ;in full. ; ;2. I have made no warranty or representation that the operation of ;this software will be error-free, and I am under no obligation to ;provide any services, by way of maintenance, update, or otherwise. ; ;3. In conjunction with products arising from the use of this ;material, there shall be no use of my name in any advertising, ;promotional, or sales literature without prior written consent in ;each case. (require 'array) (require 'scanf) (require 'string-case) (require 'string-search) (require 'array-for-each) (require-if 'compiling 'printf) ;used by cDNA:report-base-count ;;@code{(require 'ncbi-dma)} ;;@ftindex ncbi-dma (define (ncbi:read-DNA-line port) (define lst (scanf-read-list " %d %[acgt] %[acgt] %[acgt] %[acgt] %[acgt] %[acgt]" port)) (cond ((or (null? lst) (eof-object? lst)) #f) ((not (eqv? 1 (modulo (car lst) 60))) (slib:warn 'bad 'idx lst) #f) (else (apply string-append (cdr lst))))) ;;@body ;;Reads the NCBI-format DNA sequence following the word @samp{ORIGIN} ;;from @1. (define (ncbi:read-DNA-sequence port) (find-string-from-port? "ORIGIN" port) (find-string-from-port? (string #\newline) port) (do ((lne (ncbi:read-DNA-line port) (ncbi:read-DNA-line port)) (lns '() (cons lne lns))) ((not lne) (apply string-append (reverse lns))))) ;;@body ;;Reads the NCBI-format DNA sequence following the word @samp{ORIGIN} ;;from @1. (define (ncbi:read-file file) (call-with-input-file file ncbi:read-DNA-sequence)) ;;@body ;;Replaces @samp{T} with @samp{U} in @1 (define (mRNA<-cDNA str) (array-for-each (lambda (chr) (case chr ((#\a) #\a) ((#\t) #\u) ((#\c) #\c) ((#\g) #\g) ((#\A) #\A) ((#\T) #\U) ((#\C) #\C) ((#\G) #\G) (else chr))) str)) (define cDNA:codons '((TTT phe #\F) (TCT ser #\S) (TAT tyr #\Y) (TGT cys #\C) (TTC phe #\F) (TCC ser #\S) (TAC tyr #\Y) (TGC cys #\C) (TTA leu #\L) (TCA ser #\S) (TAA) (TGA) ;stops (TTG leu #\L) (TCG ser #\S) (TAG) (TGG trp #\W) (CTT leu #\L) (CCT pro #\P) (CAT his #\H) (CGT arg #\R) (CTC leu #\L) (CCC pro #\P) (CAC his #\H) (CGC arg #\R) (CTA leu #\L) (CCA pro #\P) (CAA gln #\Q) (CGA arg #\R) (CTG leu #\L) (CCG pro #\P) (CAG gln #\Q) (CGG arg #\R) (ATT ile #\I) (ACT thr #\T) (AAT asn #\N) (AGT ser #\S) (ATC ile #\I) (ACC thr #\T) (AAC asn #\N) (AGC ser #\S) (ATA ile #\I) (ACA thr #\T) (AAA lys #\K) (AGA arg #\R) (ATG met #\M) (ACG thr #\T) (AAG lys #\K) (AGG arg #\R) (GTT val #\V) (GCT ala #\A) (GAT asp #\D) (GGT gly #\G) (GTC val #\V) (GCC ala #\A) (GAC asp #\D) (GGC gly #\G) (GTA val #\V) (GCA ala #\A) (GAA glu #\E) (GGA gly #\G) (GTG val #\V) (GCG ala #\A) (GAG glu #\E) (GGG gly #\G))) ;;@body ;;Returns a list of three-letter symbol codons comprising the protein ;;sequence encoded by @1 starting with its first occurence of ;;@samp{atg}. (define (codons<-cDNA cDNA) (define len (string-length cDNA)) (define start #f) (set! start (substring-ci? "atg" cDNA)) (if (not start) (slib:warn 'missed 'start)) (let loop ((protein '(*N*)) (cdx (or start 0))) (if (<= len cdx) (slib:error 'reached 'end cdx)) (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx))))) (define asc (assq codon cDNA:codons)) (cond ((not asc) (slib:warn 'mystery 'codon codon) (reverse (cons '*C* protein))) ((null? (cdr asc)) (reverse (cons '*C* protein))) (else (loop (cons codon protein) (+ 3 cdx))))))) ;;@body ;;Returns a list of three-letter symbols for the protein sequence ;;encoded by @1 starting with its first occurence of @samp{atg}. (define (protein<-cDNA cDNA) (define len (string-length cDNA)) (define start #f) (set! start (substring-ci? "atg" cDNA)) (if (not start) (slib:warn 'missed 'start)) (let loop ((protein '(*N*)) (cdx (or start 0))) (if (<= len cdx) (slib:error 'reached 'end cdx)) (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx))))) (define asc (assq codon cDNA:codons)) (cond ((not asc) (slib:warn 'mystery 'codon codon) (reverse (cons '*C* protein))) ((null? (cdr asc)) (reverse (cons '*C* protein))) (else (loop (cons (cadr asc) protein) (+ 3 cdx))))))) ;;@body ;;Returns a string of one-letter amino acid codes for the protein ;;sequence encoded by @1 starting with its first occurence of ;;@samp{atg}. (define (P<-cDNA cDNA) (define len (string-length cDNA)) (define start #f) (set! start (substring-ci? "atg" cDNA)) (if (not start) (slib:warn 'missed 'start)) (let loop ((protein '()) (cdx (or start 0))) (if (<= len cdx) (slib:error 'reached 'end cdx)) (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx))))) (define asc (assq codon cDNA:codons)) (cond ((not asc) (slib:error 'mystery 'codon codon)) ((null? (cdr asc)) (list->string (reverse protein))) (else (loop (cons (caddr asc) protein) (+ 3 cdx))))))) ;;@ ;;These cDNA count routines provide a means to check the nucleotide ;;sequence with the @samp{BASE COUNT} line preceding the sequence from ;;NCBI. ;;@body ;;Returns a list of counts of @samp{a}, @samp{c}, @samp{g}, and ;;@samp{t} occurrencing in @1. (define (cDNA:base-count cDNA) (define cnt:a 0) (define cnt:c 0) (define cnt:g 0) (define cnt:t 0) (array-for-each (lambda (chr) (case chr ((#\a #\A) (set! cnt:a (+ 1 cnt:a))) ((#\c #\C) (set! cnt:c (+ 1 cnt:c))) ((#\g #\G) (set! cnt:g (+ 1 cnt:g))) ((#\t #\T) (set! cnt:t (+ 1 cnt:t))) (else (slib:error 'cDNA:base-count 'unknown 'base chr)))) cDNA) (list cnt:a cnt:c cnt:g cnt:t)) ;;@body ;;Prints the counts of @samp{a}, @samp{c}, @samp{g}, and @samp{t} ;;occurrencing in @1. (define (cDNA:report-base-count cDNA) (require 'printf) (apply printf "BASE COUNT %6d a %6d c %6d g %6d t\\n" (cDNA:base-count cDNA)))