summaryrefslogtreecommitdiffstats
path: root/ncbi-dna.scm
blob: d5fc5caa668bafc36735b2e43578f8f802f788d4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
;;;; "ncbi-dna.scm" Read and manipulate NCBI-format nucleotide sequences
;;; Copyright (C) 2003 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1.  Any copy made of this software must include this copyright notice
;in full.
;
;2.  I have made no warranty or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3.  In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.

(require 'array)
(require 'scanf)
(require 'string-case)
(require 'string-search)
(require 'array-for-each)
(require-if 'compiling 'printf)	;used by cDNA:report-base-count

;;@code{(require 'ncbi-dna)}
;;@ftindex ncbi-dna

;;; Reads one line of an NCBI sequence listing from PORT: a decimal
;;; position index followed by as many as six groups made up of the
;;; letters a, c, g, and t.  Returns the groups concatenated into one
;;; string.  Returns #f when nothing could be read, and also (after
;;; issuing a warning) when the index is not congruent to 1 modulo 60.
(define (ncbi:read-DNA-line port)
  (let ((fields (scanf-read-list
                 " %d %[acgt] %[acgt] %[acgt] %[acgt] %[acgt] %[acgt]"
                 port)))
    (cond ((or (null? fields) (eof-object? fields)) #f)
          ;; The first field is the position index; the rest are bases.
          ((eqv? 1 (modulo (car fields) 60))
           (apply string-append (cdr fields)))
          (else
           (slib:warn 'bad 'idx fields)
           #f))))

;;@body
;;Skips forward in @1 past the word @samp{ORIGIN} and the next
;;newline, then reads the NCBI-format DNA sequence which follows and
;;returns it as a single string.
(define (ncbi:read-DNA-sequence port)
  (find-string-from-port? "ORIGIN" port)
  (find-string-from-port? (string #\newline) port)
  ;; Collect lines (most recent first) until the line reader returns #f.
  (let collect ((line (ncbi:read-DNA-line port))
                (acc '()))
    (if line
        (collect (ncbi:read-DNA-line port) (cons line acc))
        (apply string-append (reverse acc)))))

;;@body
;;Opens the file named @1 for input and returns the NCBI-format DNA
;;sequence following the word @samp{ORIGIN} in it.
(define (ncbi:read-file file)
  (call-with-input-file file
    (lambda (port)
      (ncbi:read-DNA-sequence port))))

;;@body
;;Returns a newly allocated string in which each @samp{t} of @1 is
;;replaced with @samp{u} and each @samp{T} with @samp{U}; all other
;;characters are copied unchanged.  @1 is not modified.
(define (mRNA<-cDNA str)
  ;; Build the result explicitly: ARRAY-FOR-EACH discards the values
  ;; its procedure returns, so it cannot produce the converted sequence.
  (let* ((len (string-length str))
         (rna (make-string len)))
    (do ((idx 0 (+ 1 idx)))
        ((>= idx len) rna)
      (let ((chr (string-ref str idx)))
        (string-set! rna idx
                     (case chr
                       ((#\t) #\u)
                       ((#\T) #\U)
                       (else chr)))))))

;;; Association list keyed by DNA codon symbol.  An amino-acid entry
;;; has the form (CODON ABBREV CHAR), where ABBREV is a three-letter
;;; symbol and CHAR is a one-letter code character.  The stop codons
;;; (TAA, TGA, and TAG) are entries containing only the codon, which is
;;; how codons<-cDNA, protein<-cDNA, and P<-cDNA recognize them after
;;; looking a codon up with ASSQ.
;;; NOTE(review): those lookups use symbols made by string-ci->symbol,
;;; so this presumably relies on the reader and string-ci->symbol
;;; agreeing on symbol case -- confirm for the target implementation.
(define cDNA:codons
  '((TTT phe #\F) (TCT ser #\S) (TAT tyr #\Y) (TGT cys #\C)
    (TTC phe #\F) (TCC ser #\S) (TAC tyr #\Y) (TGC cys #\C)
    (TTA leu #\L) (TCA ser #\S) (TAA)         (TGA) ;stops
    (TTG leu #\L) (TCG ser #\S) (TAG)         (TGG trp #\W)
    (CTT leu #\L) (CCT pro #\P) (CAT his #\H) (CGT arg #\R)
    (CTC leu #\L) (CCC pro #\P) (CAC his #\H) (CGC arg #\R)
    (CTA leu #\L) (CCA pro #\P) (CAA gln #\Q) (CGA arg #\R)
    (CTG leu #\L) (CCG pro #\P) (CAG gln #\Q) (CGG arg #\R)
    (ATT ile #\I) (ACT thr #\T) (AAT asn #\N) (AGT ser #\S)
    (ATC ile #\I) (ACC thr #\T) (AAC asn #\N) (AGC ser #\S)
    (ATA ile #\I) (ACA thr #\T) (AAA lys #\K) (AGA arg #\R)
    (ATG met #\M) (ACG thr #\T) (AAG lys #\K) (AGG arg #\R)
    (GTT val #\V) (GCT ala #\A) (GAT asp #\D) (GGT gly #\G)
    (GTC val #\V) (GCC ala #\A) (GAC asp #\D) (GGC gly #\G)
    (GTA val #\V) (GCA ala #\A) (GAA glu #\E) (GGA gly #\G)
    (GTG val #\V) (GCG ala #\A) (GAG glu #\E) (GGG gly #\G)))

;;@body
;;Returns a list of three-letter symbol codons comprising the protein
;;sequence encoded by @1 starting with its first occurrence of
;;@samp{atg}.  The list begins with the symbol @samp{*N*} and ends
;;with the symbol @samp{*C*}.  An error is signaled if @1 ends before
;;a stop codon is found.
(define (codons<-cDNA cDNA)
  (let ((len (string-length cDNA))
        (start (substring-ci? "atg" cDNA)))
    ;; Without a start codon, warn and translate from the beginning.
    (if (not start) (slib:warn 'missed 'start))
    (let loop ((protein '(*N*))
               (cdx (or start 0)))
      ;; A codon needs three characters.  Checking for all three here
      ;; keeps SUBSTRING from being called with an end index past the
      ;; end of the string when one or two characters are left over.
      (if (< len (+ 3 cdx)) (slib:error 'reached 'end cdx))
      (let* ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx))))
             (asc (assq codon cDNA:codons)))
        (cond ((not asc)
               ;; Unrecognized codon: warn and end the sequence here.
               (slib:warn 'mystery 'codon codon)
               (reverse (cons '*C* protein)))
              ;; Stop codons are table entries with nothing after the key.
              ((null? (cdr asc)) (reverse (cons '*C* protein)))
              (else (loop (cons codon protein) (+ 3 cdx))))))))

;;@body
;;Returns a list of three-letter symbols for the protein sequence
;;encoded by @1 starting with its first occurrence of @samp{atg}.
;;The list begins with the symbol @samp{*N*} and ends with the symbol
;;@samp{*C*}.  An error is signaled if @1 ends before a stop codon is
;;found.
(define (protein<-cDNA cDNA)
  (let ((len (string-length cDNA))
        (start (substring-ci? "atg" cDNA)))
    ;; Without a start codon, warn and translate from the beginning.
    (if (not start) (slib:warn 'missed 'start))
    (let loop ((protein '(*N*))
               (cdx (or start 0)))
      ;; A codon needs three characters.  Checking for all three here
      ;; keeps SUBSTRING from being called with an end index past the
      ;; end of the string when one or two characters are left over.
      (if (< len (+ 3 cdx)) (slib:error 'reached 'end cdx))
      (let* ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx))))
             (asc (assq codon cDNA:codons)))
        (cond ((not asc)
               ;; Unrecognized codon: warn and end the sequence here.
               (slib:warn 'mystery 'codon codon)
               (reverse (cons '*C* protein)))
              ;; Stop codons are table entries with nothing after the key.
              ((null? (cdr asc)) (reverse (cons '*C* protein)))
              ;; The second element of an entry is the three-letter symbol.
              (else (loop (cons (cadr asc) protein) (+ 3 cdx))))))))

;;@body
;;Returns a string of one-letter amino acid codes for the protein
;;sequence encoded by @1 starting with its first occurrence of
;;@samp{atg}.  An error is signaled if an unrecognized codon is
;;encountered or if @1 ends before a stop codon is found.
(define (P<-cDNA cDNA)
  (let ((len (string-length cDNA))
        (start (substring-ci? "atg" cDNA)))
    ;; Without a start codon, warn and translate from the beginning.
    (if (not start) (slib:warn 'missed 'start))
    (let loop ((protein '())
               (cdx (or start 0)))
      ;; A codon needs three characters.  Checking for all three here
      ;; keeps SUBSTRING from being called with an end index past the
      ;; end of the string when one or two characters are left over.
      (if (< len (+ 3 cdx)) (slib:error 'reached 'end cdx))
      (let* ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx))))
             (asc (assq codon cDNA:codons)))
        (cond ((not asc) (slib:error 'mystery 'codon codon))
              ;; Stop codons are table entries with nothing after the key.
              ((null? (cdr asc)) (list->string (reverse protein)))
              ;; The third element of an entry is the one-letter code.
              (else (loop (cons (caddr asc) protein) (+ 3 cdx))))))))

;;@
;;These cDNA count routines provide a means to check the nucleotide
;;sequence against the @samp{BASE COUNT} line preceding the sequence
;;from NCBI.

;;@body
;;Returns a list of the numbers of @samp{a}, @samp{c}, @samp{g}, and
;;@samp{t} bases (of either case) occurring in @1, in that order.  An
;;error is signaled if @1 contains any other character.
(define (cDNA:base-count cDNA)
  ;; Slots 0 through 3 of TALLY hold the a, c, g, and t counts.
  (let ((tally (vector 0 0 0 0)))
    (define (bump! slot)
      (vector-set! tally slot (+ 1 (vector-ref tally slot))))
    (array-for-each
     (lambda (base)
       (case base
         ((#\a #\A) (bump! 0))
         ((#\c #\C) (bump! 1))
         ((#\g #\G) (bump! 2))
         ((#\t #\T) (bump! 3))
         (else (slib:error 'cDNA:base-count 'unknown 'base base))))
     cDNA)
    (vector->list tally)))

;;@body
;;Prints a @samp{BASE COUNT} line giving the numbers of @samp{a},
;;@samp{c}, @samp{g}, and @samp{t} bases occurring in @1.
(define (cDNA:report-base-count cDNA)
  ;; PRINTF is loaded on first use rather than when this file is loaded.
  (require 'printf)
  (let ((counts (cDNA:base-count cDNA)))
    (apply printf "BASE COUNT   %6d a %6d c %6d g %6d t\\n" counts)))