diff options
Diffstat (limited to 'ncbi-dna.scm')
-rw-r--r-- | ncbi-dna.scm | 172 |
1 files changed, 172 insertions, 0 deletions
diff --git a/ncbi-dna.scm b/ncbi-dna.scm new file mode 100644 index 0000000..03fff65 --- /dev/null +++ b/ncbi-dna.scm @@ -0,0 +1,172 @@ +;;;; "ncbi-dna.scm" Read and manipulate NCBI-format nucleotide sequences +;;; Copyright (C) 2003 Aubrey Jaffer +; +;Permission to copy this software, to modify it, to redistribute it, +;to distribute modified versions, and to use it for any purpose is +;granted, subject to the following restrictions and understandings. +; +;1. Any copy made of this software must include this copyright notice +;in full. +; +;2. I have made no warranty or representation that the operation of +;this software will be error-free, and I am under no obligation to +;provide any services, by way of maintenance, update, or otherwise. +; +;3. In conjunction with products arising from the use of this +;material, there shall be no use of my name in any advertising, +;promotional, or sales literature without prior written consent in +;each case. + +(require 'array) +(require 'scanf) +(require 'string-case) +(require 'string-search) +(require 'array-for-each) +(require-if 'compiling 'printf) ;used by cDNA:report-base-count + +(define (ncbi:read-DNA-line port) + (define lst (scanf-read-list + " %d %[acgt] %[acgt] %[acgt] %[acgt] %[acgt] %[acgt]" port)) + (cond ((or (null? lst) (eof-object? lst)) #f) + ((not (eqv? 1 (modulo (car lst) 60))) + (slib:warn 'bad 'idx lst) #f) + (else (apply string-append (cdr lst))))) + +;;@body +;;Reads the NCBI-format DNA sequence following the word @samp{ORIGIN} +;;from @1. +(define (ncbi:read-DNA-sequence port) + (find-string-from-port? "ORIGIN" port) + (find-string-from-port? (string #\newline) port) + (do ((lne (ncbi:read-DNA-line port) (ncbi:read-DNA-line port)) + (lns '() (cons lne lns))) + ((not lne) (apply string-append (reverse lns))))) + +;;@body +;;Reads the NCBI-format DNA sequence following the word @samp{ORIGIN} +;;from @1. +(define (ncbi:read-file file) + (call-with-input-file file ncbi:read-DNA-sequence)) + +;;@body +;;Replaces @samp{T} with @samp{U} in @1 +(define (mRNA<-cDNA str) + (array-for-each + (lambda (chr) + (case chr + ((#\a) #\a) + ((#\t) #\u) + ((#\c) #\c) + ((#\g) #\g) + ((#\A) #\A) + ((#\T) #\U) + ((#\C) #\C) + ((#\G) #\G) + (else chr))) + str)) + +(define cDNA:codons + '((TTT phe #\F) (TCT ser #\S) (TAT tyr #\Y) (TGT cys #\C) + (TTC phe #\F) (TCC ser #\S) (TAC tyr #\Y) (TGC cys #\C) + (TTA leu #\L) (TCA ser #\S) (TAA) (TGA) ;stops + (TTG leu #\L) (TCG ser #\S) (TAG) (TGG trp #\W) + (CTT leu #\L) (CCT pro #\P) (CAT his #\H) (CGT arg #\R) + (CTC leu #\L) (CCC pro #\P) (CAC his #\H) (CGC arg #\R) + (CTA leu #\L) (CCA pro #\P) (CAA gln #\Q) (CGA arg #\R) + (CTG leu #\L) (CCG pro #\P) (CAG gln #\Q) (CGG arg #\R) + (ATT ile #\I) (ACT thr #\T) (AAT asn #\N) (AGT ser #\S) + (ATC ile #\I) (ACC thr #\T) (AAC asn #\N) (AGC ser #\S) + (ATA ile #\I) (ACA thr #\T) (AAA lys #\K) (AGA arg #\R) + (ATG met #\M) (ACG thr #\T) (AAG lys #\K) (AGG arg #\R) + (GTT val #\V) (GCT ala #\A) (GAT asp #\D) (GGT gly #\G) + (GTC val #\V) (GCC ala #\A) (GAC asp #\D) (GGC gly #\G) + (GTA val #\V) (GCA ala #\A) (GAA glu #\E) (GGA gly #\G) + (GTG val #\V) (GCG ala #\A) (GAG glu #\E) (GGG gly #\G))) + +;;@body +;;Returns a list of three-letter symbol codons comprising the protein +;;sequence encoded by @1 starting with its first occurence of +;;@samp{atg}. +(define (codons<-cDNA cDNA) + (define len (string-length cDNA)) + (define start #f) + (set! start (substring-ci? "atg" cDNA)) + (if (not start) (slib:warn 'missed 'start)) + (let loop ((protein '(*N*)) + (cdx (or start 0))) + (if (<= len cdx) (slib:error 'reached 'end cdx)) + (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx))))) + (define asc (assq codon cDNA:codons)) + (cond ((not asc) + (slib:warn 'mystery 'codon codon) + (reverse (cons '*C* protein))) + ((null? (cdr asc)) (reverse (cons '*C* protein))) + (else (loop (cons codon protein) (+ 3 cdx))))))) + +;;@body +;;Returns a list of three-letter symbols for the protein sequence +;;encoded by @1 starting with its first occurence of @samp{atg}. +(define (protein<-cDNA cDNA) + (define len (string-length cDNA)) + (define start #f) + (set! start (substring-ci? "atg" cDNA)) + (if (not start) (slib:warn 'missed 'start)) + (let loop ((protein '(*N*)) + (cdx (or start 0))) + (if (<= len cdx) (slib:error 'reached 'end cdx)) + (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx))))) + (define asc (assq codon cDNA:codons)) + (cond ((not asc) + (slib:warn 'mystery 'codon codon) + (reverse (cons '*C* protein))) + ((null? (cdr asc)) (reverse (cons '*C* protein))) + (else (loop (cons (cadr asc) protein) (+ 3 cdx))))))) + +;;@body +;;Returns a string of one-letter amino acid codes for the protein +;;sequence encoded by @1 starting with its first occurence of +;;@samp{atg}. +(define (P<-cDNA cDNA) + (define len (string-length cDNA)) + (define start #f) + (set! start (substring-ci? "atg" cDNA)) + (if (not start) (slib:warn 'missed 'start)) + (let loop ((protein '()) + (cdx (or start 0))) + (if (<= len cdx) (slib:error 'reached 'end cdx)) + (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx))))) + (define asc (assq codon cDNA:codons)) + (cond ((not asc) (slib:error 'mystery 'codon codon)) + ((null? (cdr asc)) (list->string (reverse protein))) + (else (loop (cons (caddr asc) protein) (+ 3 cdx))))))) + +;;@ +;;These cDNA count routines provide a means to check the nucleotide +;;sequence with the @samp{BASE COUNT} line preceding the sequence from +;;NCBI. + +;;@body +;;Returns a list of counts of @samp{a}, @samp{c}, @samp{g}, and +;;@samp{t} occurrencing in @1. +(define (cDNA:base-count cDNA) + (define cnt:a 0) + (define cnt:c 0) + (define cnt:g 0) + (define cnt:t 0) + (array-for-each (lambda (chr) + (case chr + ((#\a #\A) (set! cnt:a (+ 1 cnt:a))) + ((#\c #\C) (set! cnt:c (+ 1 cnt:c))) + ((#\g #\G) (set! cnt:g (+ 1 cnt:g))) + ((#\t #\T) (set! cnt:t (+ 1 cnt:t))) + (else (slib:error 'cDNA:base-count 'unknown 'base chr)))) + cDNA) + (list cnt:a cnt:c cnt:g cnt:t)) + +;;@body +;;Prints the counts of @samp{a}, @samp{c}, @samp{g}, and @samp{t} +;;occurrencing in @1. +(define (cDNA:report-base-count cDNA) + (require 'printf) + (apply printf "BASE COUNT %6d a %6d c %6d g %6d t\\n" + (cDNA:base-count cDNA))) |