summaryrefslogtreecommitdiffstats
path: root/ncbi-dna.scm
diff options
context:
space:
mode:
Diffstat (limited to 'ncbi-dna.scm')
-rw-r--r--ncbi-dna.scm172
1 files changed, 172 insertions, 0 deletions
diff --git a/ncbi-dna.scm b/ncbi-dna.scm
new file mode 100644
index 0000000..03fff65
--- /dev/null
+++ b/ncbi-dna.scm
@@ -0,0 +1,172 @@
+;;;; "ncbi-dna.scm" Read and manipulate NCBI-format nucleotide sequences
+;;; Copyright (C) 2003 Aubrey Jaffer
+;
+;Permission to copy this software, to modify it, to redistribute it,
+;to distribute modified versions, and to use it for any purpose is
+;granted, subject to the following restrictions and understandings.
+;
+;1. Any copy made of this software must include this copyright notice
+;in full.
+;
+;2. I have made no warranty or representation that the operation of
+;this software will be error-free, and I am under no obligation to
+;provide any services, by way of maintenance, update, or otherwise.
+;
+;3. In conjunction with products arising from the use of this
+;material, there shall be no use of my name in any advertising,
+;promotional, or sales literature without prior written consent in
+;each case.
+
+(require 'array)
+(require 'scanf)
+(require 'string-case)
+(require 'string-search)
+(require 'array-for-each)
+(require-if 'compiling 'printf) ;used by cDNA:report-base-count
+
+(define (ncbi:read-DNA-line port)
+ (define lst (scanf-read-list
+ " %d %[acgt] %[acgt] %[acgt] %[acgt] %[acgt] %[acgt]" port))
+ (cond ((or (null? lst) (eof-object? lst)) #f)
+ ((not (eqv? 1 (modulo (car lst) 60)))
+ (slib:warn 'bad 'idx lst) #f)
+ (else (apply string-append (cdr lst)))))
+
+;;@body
+;;Reads the NCBI-format DNA sequence following the word @samp{ORIGIN}
+;;from @1.
+(define (ncbi:read-DNA-sequence port)
+ (find-string-from-port? "ORIGIN" port)
+ (find-string-from-port? (string #\newline) port)
+ (do ((lne (ncbi:read-DNA-line port) (ncbi:read-DNA-line port))
+ (lns '() (cons lne lns)))
+ ((not lne) (apply string-append (reverse lns)))))
+
+;;@body
+;;Reads the NCBI-format DNA sequence following the word @samp{ORIGIN}
+;;from @1.
+(define (ncbi:read-file file)
+ (call-with-input-file file ncbi:read-DNA-sequence))
+
+;;@body
+;;Replaces @samp{T} with @samp{U} in @1
+(define (mRNA<-cDNA str)
+ (array-for-each
+ (lambda (chr)
+ (case chr
+ ((#\a) #\a)
+ ((#\t) #\u)
+ ((#\c) #\c)
+ ((#\g) #\g)
+ ((#\A) #\A)
+ ((#\T) #\U)
+ ((#\C) #\C)
+ ((#\G) #\G)
+ (else chr)))
+ str))
+
+(define cDNA:codons
+ '((TTT phe #\F) (TCT ser #\S) (TAT tyr #\Y) (TGT cys #\C)
+ (TTC phe #\F) (TCC ser #\S) (TAC tyr #\Y) (TGC cys #\C)
+ (TTA leu #\L) (TCA ser #\S) (TAA) (TGA) ;stops
+ (TTG leu #\L) (TCG ser #\S) (TAG) (TGG trp #\W)
+ (CTT leu #\L) (CCT pro #\P) (CAT his #\H) (CGT arg #\R)
+ (CTC leu #\L) (CCC pro #\P) (CAC his #\H) (CGC arg #\R)
+ (CTA leu #\L) (CCA pro #\P) (CAA gln #\Q) (CGA arg #\R)
+ (CTG leu #\L) (CCG pro #\P) (CAG gln #\Q) (CGG arg #\R)
+ (ATT ile #\I) (ACT thr #\T) (AAT asn #\N) (AGT ser #\S)
+ (ATC ile #\I) (ACC thr #\T) (AAC asn #\N) (AGC ser #\S)
+ (ATA ile #\I) (ACA thr #\T) (AAA lys #\K) (AGA arg #\R)
+ (ATG met #\M) (ACG thr #\T) (AAG lys #\K) (AGG arg #\R)
+ (GTT val #\V) (GCT ala #\A) (GAT asp #\D) (GGT gly #\G)
+ (GTC val #\V) (GCC ala #\A) (GAC asp #\D) (GGC gly #\G)
+ (GTA val #\V) (GCA ala #\A) (GAA glu #\E) (GGA gly #\G)
+ (GTG val #\V) (GCG ala #\A) (GAG glu #\E) (GGG gly #\G)))
+
+;;@body
+;;Returns a list of three-letter symbol codons comprising the protein
+;;sequence encoded by @1 starting with its first occurence of
+;;@samp{atg}.
+(define (codons<-cDNA cDNA)
+ (define len (string-length cDNA))
+ (define start #f)
+ (set! start (substring-ci? "atg" cDNA))
+ (if (not start) (slib:warn 'missed 'start))
+ (let loop ((protein '(*N*))
+ (cdx (or start 0)))
+ (if (<= len cdx) (slib:error 'reached 'end cdx))
+ (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx)))))
+ (define asc (assq codon cDNA:codons))
+ (cond ((not asc)
+ (slib:warn 'mystery 'codon codon)
+ (reverse (cons '*C* protein)))
+ ((null? (cdr asc)) (reverse (cons '*C* protein)))
+ (else (loop (cons codon protein) (+ 3 cdx)))))))
+
+;;@body
+;;Returns a list of three-letter symbols for the protein sequence
+;;encoded by @1 starting with its first occurence of @samp{atg}.
+(define (protein<-cDNA cDNA)
+ (define len (string-length cDNA))
+ (define start #f)
+ (set! start (substring-ci? "atg" cDNA))
+ (if (not start) (slib:warn 'missed 'start))
+ (let loop ((protein '(*N*))
+ (cdx (or start 0)))
+ (if (<= len cdx) (slib:error 'reached 'end cdx))
+ (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx)))))
+ (define asc (assq codon cDNA:codons))
+ (cond ((not asc)
+ (slib:warn 'mystery 'codon codon)
+ (reverse (cons '*C* protein)))
+ ((null? (cdr asc)) (reverse (cons '*C* protein)))
+ (else (loop (cons (cadr asc) protein) (+ 3 cdx)))))))
+
+;;@body
+;;Returns a string of one-letter amino acid codes for the protein
+;;sequence encoded by @1 starting with its first occurence of
+;;@samp{atg}.
+(define (P<-cDNA cDNA)
+ (define len (string-length cDNA))
+ (define start #f)
+ (set! start (substring-ci? "atg" cDNA))
+ (if (not start) (slib:warn 'missed 'start))
+ (let loop ((protein '())
+ (cdx (or start 0)))
+ (if (<= len cdx) (slib:error 'reached 'end cdx))
+ (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx)))))
+ (define asc (assq codon cDNA:codons))
+ (cond ((not asc) (slib:error 'mystery 'codon codon))
+ ((null? (cdr asc)) (list->string (reverse protein)))
+ (else (loop (cons (caddr asc) protein) (+ 3 cdx)))))))
+
+;;@
+;;These cDNA count routines provide a means to check the nucleotide
+;;sequence with the @samp{BASE COUNT} line preceding the sequence from
+;;NCBI.
+
+;;@body
+;;Returns a list of counts of @samp{a}, @samp{c}, @samp{g}, and
+;;@samp{t} occurrencing in @1.
+(define (cDNA:base-count cDNA)
+ (define cnt:a 0)
+ (define cnt:c 0)
+ (define cnt:g 0)
+ (define cnt:t 0)
+ (array-for-each (lambda (chr)
+ (case chr
+ ((#\a #\A) (set! cnt:a (+ 1 cnt:a)))
+ ((#\c #\C) (set! cnt:c (+ 1 cnt:c)))
+ ((#\g #\G) (set! cnt:g (+ 1 cnt:g)))
+ ((#\t #\T) (set! cnt:t (+ 1 cnt:t)))
+ (else (slib:error 'cDNA:base-count 'unknown 'base chr))))
+ cDNA)
+ (list cnt:a cnt:c cnt:g cnt:t))
+
+;;@body
+;;Prints the counts of @samp{a}, @samp{c}, @samp{g}, and @samp{t}
+;;occurrencing in @1.
+(define (cDNA:report-base-count cDNA)
+ (require 'printf)
+ (apply printf "BASE COUNT %6d a %6d c %6d g %6d t\\n"
+ (cDNA:base-count cDNA)))