;; bio-mode.el --- major mode for manipulating biological sequences
;; Copyright (C) 1997 Thomas Sicheritz
;;                                  All rights reserved.
;; Last changed: Time-stamp: <98/06/15 08:26:09 thomas>
;;
;; Authors: Thomas Sicheritz <Thomas.Sicheritz@molbio.uu.se>
;; 
;; Created: January 1997
;; Version:         1.002
;; Keywords: bio molecular biology sequences
;;
;;

;; This file is not part of any Emacs

;;{{{ GPL
;;    This program is free software; you can redistribute it and/or modify
;;    it under the terms of the GNU General Public License as published by
;;    the Free Software Foundation; either version 2 of the License, or
;;    (at your option) any later version.
;;
;;    This program is distributed in the hope that it will be useful,
;;    but WITHOUT ANY WARRANTY; without even the implied warranty of
;;    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;    GNU General Public License for more details.
;;
;;    You should have received a copy of the GNU General Public License
;;    along with this program; if not, write to the Free Software
;;    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
;;}}}
;;{{{ Installation

;;  put 
;; (autoload 'bio-mode "bio-mode" 
;;  "Molecular Biology Mode" t)
;;  (setq auto-mode-alist
;;        (append '(("\\.embl$" . bio-mode)
;;                  ("\\.seq$" . bio-mode)
;;                  ("\\.fas$" . bio-mode)
;;                  ("\\.phy$" . bio-mode))
;;                auto-mode-alist))
;;  in your .emacs

;;}}}
;;{{{ Todo

;;find ORF longer than # nt (if !argc return longest)
;;Codon usage

;;}}}
;;{{{ Usage

;; All commands working on marked regions will first discard any
;; non-valid sequence characters. e.g. In an EMBL file the leading
;; "FT       " will not appear in the sequence - the same for integers,
;; spaces newlines ...
;; Translation:
;; 	Mark a region and invoke M-x bio-translate-region.
;; Blast:
;; 	Mark a region and invoke M-x bio-blast-region.
;; G+C content:
;; 	Mark a region and invoke M-x bio-gccontent-on-region
;; Converting EMBL to Gene Table Format (GTF):
;; 	Mark a region and invoke M-x bio-convert-embl2-gtf
;; Extracting all CDS with amino-acid sequence from EMBL
;; 	Mark a region and invoke M-x bio-embl-extract
;; 
;; Readseq, wrapper functions for using readseq with bio-mode
;;      Mark a region and invoke M-x bio-writeseq
;;      Invoke M-x bio-readseq for reading sequence files

;;}}}

;;;Code

;;{{{ Compatibility functions

;; Needs some Common Lisp functions
;;(require 'cl)
;; Check if replace-in-string is defined 
(cond ((null (fboundp 'replace-in-string))
       (defun replace-in-string (str regexp newtext &optional literal)
;;{{{ 

	 "Replaces all matches in STR for REGEXP with NEWTEXT string.
Optional LITERAL non-nil means do a literal replacement.
Otherwise treat \\ in NEWTEXT string as special:
  \\& means substitute original matched text,
  \\N means substitute match for \(...\) number N,
  \\\\ means insert one \\."
	 (if (not (stringp str))
	     (error "(replace-in-string): First argument must be a string: %s" str))
	 (if (stringp newtext)
	     nil
	   (error "(replace-in-string): 3rd arg must be a string: %s"
		  newtext))
	 (let ((rtn-str "")
	       (start 0)
	       (special)
	       match prev-start)
	   (while (setq match (string-match regexp str start))
	     (setq prev-start start
		   start (match-end 0)
		   rtn-str
		   (concat
		    rtn-str
		    (substring str prev-start match)
		    (cond (literal newtext)
			  (t (mapconcat
			      (function
			       (lambda (c)
				 (if special
				     (progn
				       (setq special nil)
				       (cond ((eq c ?\\) "\\")
					     ((eq c ?&)
					      (substring str
							 (match-beginning 0)
							 (match-end 0)))
					     ((and (>= c ?0) (<= c ?9))
					      (if (> c (+ ?0 (length
							      (match-data))))
						  ;; Invalid match num
						  (error "(replace-in-string) Invalid match num: %c" c)
						(setq c (- c ?0))
						(substring str
							   (match-beginning c)
							   (match-end c))))
					     (t (char-to-string c))))
				   (if (eq c ?\\) (progn (setq special t) nil)
				     (char-to-string c)))))
			      newtext ""))))))
	   (concat rtn-str (substring str start))))
       ))

;;}}}

;; check makehash-table, make alias if necessary (which needs Common Lisp)
(cond ((null (fboundp 'make-hash-table))
       (require 'cl)))
;(cond ((null (fboundp 'makehash-table))
;       (defalias 'makehash-table 'make-hash-table)))
;(fboundp 'makehash-table)

(cond ((null (fboundp 'puthash))
       (defun puthash (key value table)
	 (setf (gethash key table) value))))

;; processes
(cond ((null (fboundp 'exec-to-string))
       (defmacro with-output-to-string (&rest forms)
	 "Collect output to `standard-output' while evaluating FORMS and return
it as a string."
	 ;; by "William G. Dubuque" <wgd@zurich.ai.mit.edu> w/ mods from Stig
	 (` (save-excursion
	      (set-buffer (get-buffer-create " *string-output*"))
	      (setq buffer-read-only nil)
	      (buffer-disable-undo (current-buffer))
	      (erase-buffer)
	      (let ((standard-output (current-buffer)))
		(,@ forms))
	      (prog1
		  (buffer-string)
		(erase-buffer)))))
       (defun exec-to-string (command)
	 "Execute COMMAND as an external process and return the output of that
process as a string"
	 ;; by "William G. Dubuque" <wgd@zurich.ai.mit.edu>
	 (with-output-to-string
	  (call-process shell-file-name nil t nil "-c" command)))
       ))

(if (not (fboundp 'char-to-int))
    (defun char-to-int (c)
      c)
  )



;;}}}
;;{{{ Global variables and Macros

;; some Macros borrowed from Jari Aalto's "Jari's tiny tool package"
;; ftp://cs.uta.fi/pub/ssjaaa/tiny-docs.html
(defsubst xemacs-p ()
  "Tests if we're in XEmacs."
  (string-match "XEmacs" emacs-version))

(defsubst emacs-p ()
  "Tests if we're in Emacs."
  (not (boundp 'xemacs-logo)))

;; the regex for grabbing an entry from an EMBL file
(defconst bio-embl-entry "^FT   [Crt][DR].*\\(\nFT     .*\\)+")
;; hashtable for genetic code
(setf bio-gencode-table (make-hash-table 64))

;;}}}
;;{{{ Functions

;; String functions
(defun bio-insert-string (s &optional b)
  "Like the XEmacs insert-string function, inserts string s
into buffer b"
  (cond ((null (stringp b))
	 (setq b (current-buffer))))
  (save-excursion
    (let* ((from-buffer (current-buffer))
	   (to-buffer b)
	   )
      (progn
	(pop-to-buffer to-buffer)
	(insert-string s)
	(pop-to-buffer from-buffer))
      )))

;;}}}
;;{{{ Converting

;; Converting Embl to GTF
(defun bio-gene-name-lookup (name)
  ;; try to grab the gene name out of the EMBL/GB naming jungle
  (cond 
   ((string-match "t[a-z ]*RNA[-\ ]*[Aa]la" name) "trnA" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Aa]la" name) "trnA" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Aa]rg" name) "trnR" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Aa]sn" name) "trnN" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Aa]sp" name) "trnD" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Cc]ys" name) "trnC" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Gg]ln" name) "trnQ" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Gg]lu" name) "trnE" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Gg]ly" name) "trnG" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Hh]is" name) "trnH" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Ii]le" name) "trnI" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Ll]eu" name) "trnL" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Ll]ys" name) "trnK" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Mm]et" name) "trnM" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Pp]he" name) "trnF" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Pp]ro" name) "trnP" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Ss]er" name) "trnS" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Tt]hr" name) "trnT" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Tt]rp" name) "trnW" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Tt]yr" name) "trnY" )
   ((string-match "t[a-z ]*RNA[-\ ]*[Vv]al" name) "trnV" )
   ((string-match "t[a-z ]*RNA[-\ ]*fMet" name) "trnfM" )

   ((string-match "[Aa]la[-\ ]*tRNA" name) "trnA" )
   ((string-match "[Aa]rg[-\ ]*tRNA" name) "trnR" )
   ((string-match "[Aa]sn[-\ ]*tRNA" name) "trnN" )
   ((string-match "[Aa]sp[-\ ]*tRNA" name) "trnD" )
   ((string-match "[Cc]ys[-\ ]*tRNA" name) "trnC" )
   ((string-match "[Gg]ln[-\ ]*tRNA" name) "trnQ" )
   ((string-match "[Gg]lu[-\ ]*tRNA" name) "trnE" )
   ((string-match "[Gg]ly[-\ ]*tRNA" name) "trnG" )
   ((string-match "[Hh]is[-\ ]*tRNA" name) "trnH" )
   ((string-match "[Ii]le[-\ ]*tRNA" name) "trnI" )
   ((string-match "[Ll]eu[-\ ]*tRNA" name) "trnL" )
   ((string-match "[Ll]ys[-\ ]*tRNA" name) "trnK" )
   ((string-match "[Mm]et[-\ ]*tRNA" name) "trnM" )
   ((string-match "[Pp]he[-\ ]*tRNA" name) "trnF" )
   ((string-match "[Pp]ro[-\ ]*tRNA" name) "trnP" )
   ((string-match "[Ss]er[-\ ]*tRNA" name) "trnS" )
   ((string-match "[Ss]er[-\ ]*tRNA" name) "trnS" )
   ((string-match "[Ss]er[-\ ]*tRNA" name) "trnS" )
   ((string-match "[Tt]hr[-\ ]*tRNA" name) "trnT" )
   ((string-match "[Tt]rp[-\ ]*tRNA" name) "trnW" )
   ((string-match "[Tt]yr[-\ ]*tRNA" name) "trnY" )
   ((string-match "[vV]al[-\ ]*tRNA" name) "trnY" )

   ((string-match "^\\(rrn[A-Z]\\) \\([0-9]+[sS]\\) *" name) 
    (concat (match-string 1 name) (match-string 2 name)))

   ((string-match "^rrn\\([A-Z]\\) *" name) 
    (substring name (match-beginning 0) (match-end 0)))

   ((string-match "cytochrome\ *oxidase[,\ ]*subunit\ *1" name) "cox1")
   ((string-match "cytochrome\ *oxidase[,\ ]*subunit\ *2" name) "cox2")
   ((string-match "cytochrome\ *oxidase[,\ ]*subunit\ *3" name) "cox3")
   ((string-match "cytochrome\ *oxidase[,\ ]*subunit\ *4" name) "cox4")
   ((string-match "cytochrome b" name) "cytB" )
   ((string-match "cytochrome c" name) "cytC" )
   ((string-match "ribosomal\ *protein\ *[lL]" name) "rpl" )
   ((string-match "ribosomal\ *protein\ *[sS]" name) "rps" )

   ((string-match "12[sS].*" name) "rrn12" )
   ((string-match "4\.5[sS]*\ *ribosomal\ *RNA" name) "rrn4.5" )
   ((string-match "4\.5[sS]*\ *rRNA" name) "rrn4.5" )
   ((string-match "23[sS]\ *ribosomal\ *RNA" name) "rrnl" )
   ((string-match "23[sS]\ *rRNA" name) "rrnl" )
   ((string-match "large\ *ribosomal\ *RNA" name) "rrnl" )
   ((string-match "large\ *rRNA" name) "rrnl" )
   ((string-match "small\ *ribosomal\ *RNA" name) "rrns" )
   ((string-match "small\ *rRNA" name) "rrns" )
   ((replace-in-string name "[ \t]+" "_"))
   )
  )
(defun bio-embl-entry-gtf (entry out)
  ;; Converting an entry from a EMBL file to Gene Table Format (GTF)
  (let* (
	 (gene-name "no_gene")
	 (gene-real-name "xxx")
	 (note-name "no_note")
	 (product-name "no_product")
	 (standard-name "no_standard")
	 )
    (progn
      (if (and (string-match "gene=" entry)
	       (string-match "gene=\"\\(.*\\)\"" entry))
	  (setq gene-name (substring entry (match-beginning 1)
				     (match-end 1))))
    
      (if (and (string-match "note=" entry)
	       (string-match "note=\"\\([0-9a-zA-Z ]+\\)[;\"]" entry))
	  (setq note-name (substring entry (match-beginning 1)
				     (match-end 1))))
    
      (if (and (string-match "product=" entry)
	       (string-match "product=\"\\(.*\\)\"" entry))
	  (setq product-name (substring entry (match-beginning 1)
					(match-end 1))))
	 
      (if (and (string-match "standard_name=" entry)
	       (string-match "standard_name=\"\\(.*\\)\"" entry))
	  (setq standard-name (substring entry (match-beginning 1)
					 (match-end 1))))
	 
      (if (string-match "complement" entry)
	  (setq gene-direction "R")
	(setq gene-direction "F"))
      (cond 
       ((or (string-match ".." entry) (string-match ". ."))
	(string-match "\\([0-9]+\\)\. *\.\\([0-9]+\\)" entry)
	(setq gene-start (string-to-int (substring entry (match-beginning 1)
						   (match-end 1))))
	(setq gene-stop (string-to-int (substring entry (match-beginning 2)
						  (match-end 2))))))

      (cond 
       ((string-match "tRNA" entry) (setq gene-type "tRNA"))
       ((string-match "rRNA" entry) (setq gene-type "rRNA"))
       ((setq gene-type "CSD"))
       )
      (setq gene-length (- gene-stop gene-start))
       
      (cond 
       ((not (string-match "no_gene" gene-name))
	(setq gene-real-name (bio-gene-name-lookup gene-name)))
       ((not (string-match "no_standard" standard-name))
	(setq gene-real-name (bio-gene-name-lookup standard-name)))
       ((and 
	 (not (string-match "no_product" product-name))
	 (not (string-match "ypothetical.*rotein" product-name))
	 (not (string-match "omolog" product-name)))
	(setq gene-real-name (bio-gene-name-lookup product-name)))
       ((not (string-match "no_note" note-name))
	(setq gene-real-name (bio-gene-name-lookup note-name))))
   

      (bio-insert-string (concat gene-real-name "\t")  out)
      (bio-insert-string (concat gene-type "\t") out)
      (bio-insert-string (concat gene-direction "\t") out)
      (bio-insert-string (concat (int-to-string gene-start) "\t") out)
      (bio-insert-string (concat (int-to-string gene-stop) "\t") out)
      (bio-insert-string (concat (int-to-string gene-length)  "\t\n") out)
      )
    )
  )

(defun bio-convert-prepare (beginning end)
  ;; I dont remeber what that was for ...
  (interactive "r")
  (let* (
	 (rentry-counter 0))
    (while (and (< (point) end)
		(re-search-forward "\\(note=\".*[^\"]\\)\nFT[\t ]+" nil t))
      (setq rentry-counter (+ 1 rentry-counter))    
      (message " replacing[%d]" rentry-counter)
      (replace-match "\\1 " nil nil))
    )
  )

(defun bio-convert-embl2-gtf (beginning end)
  "Convert EMBL file Gene Table Format (GTF)"
  (interactive "r")
  (message "Converting embl->gtf")
  (let* (	 (buffer-name (generate-new-buffer-name "*embl2gtf*"))
		 (entry-counter 0)
		 (gtf-organism "no_id"))
    (progn 
      (get-buffer-create buffer-name)
      (while (and (< (point) end) (string= gtf-organism "no_id")
		  (re-search-forward "^DE +\\(.*\\)$" nil t))
	(setq gtf-organism (match-string 1)))
     
      (while (and (< (point) end)
		  (re-search-forward bio-embl-entry nil t))
	(bio-embl-entry-gtf (match-string 0) buffer-name)
	(setq entry-counter (+ 1 entry-counter))
	(message "Getting entry[%d]" entry-counter)
	)
      (pop-to-buffer buffer-name)  
      (sort-numeric-fields 4 (point-min) (point-max))
      (goto-char (point-min))
      (insert-string (concat "#GTF\nOrganism: "  gtf-organism "\n"))
      (insert-string "Type: \nContigs: 0\n")
      (insert-string "\n\ndefinition of format:\n\nname\ttype\torient\tstart\tstop\tlength\n")
      (message "Ready, found %d entries." entry-counter)
      )
    )
  )

(defun bio-embl-entry-extract (entry out)
  ;; Extracts the translation part
  (let* (
	 (translation-name "no_translation")
	 (gene-name "no_gene")
	 (gene-real-name "xxx")
	 (note-name "no_note")
	 (product-name "no_product")
	 (standard-name "no_standard"))
    (progn

      (if (and (string-match "translation=" entry)
	       (string-match "translation=\"\\([A-Z \n]+\\)\"" entry))
	  (setq translation-name-tmp 
		(replace-in-string (substring entry (match-beginning 1)
					      (match-end 1)) "FT[ \t]+" "")))

      (if (and (string-match "gene=" entry)
	       (string-match "gene=\"\\(.*\\)\"" entry))
	  (setq gene-name (substring entry (match-beginning 1)
				     (match-end 1))))
    
      (if (and (string-match "note=" entry)
	       (string-match "note=\"\\([0-9a-zA-Z ]+\\)[;\"]" entry))
	  (setq note-name (substring entry (match-beginning 1)
				     (match-end 1))))
    
      (if (and (string-match "product=" entry)
	       (string-match "product=\"\\(.*\\)\"" entry))
	  (setq product-name (substring entry (match-beginning 1)
					(match-end 1))))
	 
      (if (and (string-match "standard_name=" entry)
	       (string-match "standard_name=\"\\(.*\\)\"" entry))
	  (setq standard-name (substring entry (match-beginning 1)
					 (match-end 1))))
      
      (cond 
       ((not (string-match "no_gene" gene-name))
	(setq gene-real-name (bio-gene-name-lookup gene-name)))
       ((not (string-match "no_standard" standard-name))
	(setq gene-real-name (bio-gene-name-lookup standard-name)))
       ((and 
	 (not (string-match "no_product" product-name))
	 (not (string-match "ypothetical.*rotein" product-name))
	 (not (string-match "omolog" product-name)))
	(setq gene-real-name (bio-gene-name-lookup product-name)))
       ((not (string-match "no_note" note-name))
	(setq gene-real-name (bio-gene-name-lookup note-name))))

	 
      (if (string-match "complement" entry)
	  (setq gene-direction "R")
	(setq gene-direction "F"))
      
      (cond 
       ((or (string-match ".." entry) (string-match ". ."))
	(string-match "\\([0-9]+\\)\. *\.\\([0-9]+\\)" entry)
	(setq gene-start (string-to-int (substring entry (match-beginning 1)
						   (match-end 1))))
	(setq gene-stop (string-to-int (substring entry (match-beginning 2)
						  (match-end 2))))))
      
      (bio-insert-string (concat ">" gene-real-name ", ") out)
      (bio-insert-string (concat "pos: " gene-start  " " gene-direction "\n" ) out)
      (bio-insert-string translation-name-tmp out)
      (bio-insert-string "\n" out)
      )
    )
  )
(defun bio-embl-extract (beginning end)
  "Extracting all Amino Acid sequences from an EMBL file 
  to multiple FASTA format"
  (interactive "r")
  (message "Extracting all Amino Acid sequences")
  (setq buffer-name (generate-new-buffer-name "*embl-extract*"))
  (get-buffer-create buffer-name)
  (setq entry-counter 0)

  (while (and (< (point) end)
	      (re-search-forward bio-embl-entry nil t))
    (setq embl-current-entry (match-string 0))
    (cond ((string-match "translation=\"\\([A-Z \n]+\\)\"" embl-current-entry)
	   (bio-embl-entry-extract embl-current-entry buffer-name)
	   (setq entry-counter (+ 1 entry-counter))
	   (message "Getting entry[%d]" entry-counter)
	   )
	  ))
  (pop-to-buffer buffer-name)  
  (message "Ready, found %d entries." entry-counter)
  )

;;}}}
;;{{{ Blast

(defun bio-blast-on-region (beginning end)
  "Grabs the marked region, discards all non valid characters and strings
\(FT     , newlines, tabs, spaces and digits) and sends the remaining 
sequence to blast"
  (interactive "r")
  (setq bio-current-seq (bio-region-to-seq 
			 (buffer-substring beginning end)))
  (setq bio-seq-file (buffer-name))
  (if (string= "nt" (bio-check-seq-type 
		     (substring bio-current-seq 0 
				(cond 
				 ((>= (length bio-current-seq) 10) 10)
				 ((length seq))))))
      (setq bio-prompt "blastn embl")
    (setq bio-prompt "blastp swiss"))
  
  (setq bio-prompt (read-string "Run blast like this: " bio-prompt))
  (setq buffer-name (generate-new-buffer-name "*blast*"))
  (get-buffer-create buffer-name)
  (setq bio-temp-file (make-temp-name (concat "/tmp/" "emacs-blast")))
  (pop-to-buffer buffer-name)
  (insert-string (concat "> Emacs bio-mode running blast from " bio-seq-file "\n") ) 
  (bio-pretty-output bio-current-seq buffer-name 50)
  (insert-string "\n\n")
  (append-to-file (point-min) (point-max) bio-temp-file) 
  (setq bio-command (concat bio-prompt " " bio-temp-file))
  (message bio-command)
  (setq bio-blastp-res (exec-to-string bio-command))
  (insert-string bio-blastp-res)
  (delete-file bio-temp-file)
  (message (concat "Emacs " bio-prompt " Ready !!!"))
  (goto-char (point-min))
  )

;;}}}
;;{{{ Search Oligo

(defun bio-change-oligo (oligo)
  ;; changes the olgi to search into an Emacs regex, in order
  ;; to ignore space, tabs, newlines integers ...
  (setq oligo (replace-in-string  oligo "[aA]" "[aA][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[cC]" "[cC][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[gG]" "[gG][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[tT]" "[tT][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[rR]" "[rRaAgG][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[yY]" "[yYcCtT][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[mM]" "[mMaAcC][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[kK]" "[kKgGtT][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[sS]" "[sScCgG][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[wW]" "[wWaAtT][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[bB]" "[bBcCgGtT][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[dD]" "[dDaAgGtT][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[hH]" "[hHaAcCtT][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[vV]" "[vVaAcCgG][0-9 \\\t\\\n]*"))
  (setq oligo (replace-in-string  oligo "[nN]" "[nNxXaAcCgGtT][0-9 \\\t\\\n]*"))
  )
(defun bio-search-forward (p oligo)
  "Search forward from point for oligo.
Ignores on sequence specific characters like
newline, tab, space, integers.."
  ;; uses function from the isearch package 
  ;; for highlighing
  (isearch-dehighlight t)
  (interactive "p\nsSearch for oligo: ")
  (setq oligo (bio-change-oligo oligo))
  (re-search-forward oligo nil t)
  (isearch-make-extent (match-beginning 0) (match-end 0))
  ) 

;;}}}

;;{{{ GC-content, Sequence Information

(defun bio-gccontent-on-region (beginning end)

  "Calculates the G+C content in the marked region
\(after discarding of all non sequence valid characters and strings\)"
  (interactive "r")
  (setq bio-current-seq (bio-region-to-seq (buffer-substring beginning end)))
  (message (concat "G+C = " (number-to-string 
			     (/ (* 100.0 (+ (count ?G bio-current-seq) 
					    (count ?C bio-current-seq) 
					    (count ?g bio-current-seq) 
					    (count ?t bio-current-seq))) 
				(length bio-current-seq))))
	   )
  )
(defun bio-is-nt-gc (nt)
  (cond 
   ((eq nt 67) 1)
   ((eq nt 71) 1)
   ((eq nt 99) 1)
   ((eq nt 103) 1)
   (t 0)))
(defun bio-count-gc (seq)
  (cond
   ((eq (mod (length seq) 3) 2) (setq seq (concat seq "n")))
   ((eq (mod (length seq) 3) 1) (setq seq (concat seq "nn")))
   )
  (let ((counter 0)
	(gc1 0)
	(gc2 0)
	(gc3 0)
	)
    (loop
     (if (>= counter (length seq)) (return (list gc1 gc2 gc3)))
     (setq gc1 (+ gc1 (bio-is-nt-gc (char-to-int (elt seq counter)))))
     (setq gc2 (+ gc2 (bio-is-nt-gc (char-to-int (elt seq (+ counter 1))))))
     (setq gc3 (+ gc3 (bio-is-nt-gc (char-to-int (elt seq (+ counter 2))))))
     (setq counter (+ counter 3))
     )))
(setq bio-current-seq "ATGGCAAATTCTTTCGTAAAAAGATGGT")
(bio-count-gc bio-current-seq)


(defun bio-ATrich-over-GCrich-codons (seq)
  "Calculates the ratio of amino acids coded for by AT-rich
codons over amino acids coded for by GC-rich codons" 
;  (interactive "r")
  (setq seq (substitute ?T ?U (upcase seq)))
  (cond
   ((eq (mod (length seq) 3) 2) (setq seq (concat seq "N")))
   ((eq (mod (length seq) 3) 1) (setq seq (concat seq "NN")))
   )
  (let ((counter 0)
	(nAT 0)
	(nGC 0)
	(AT-rich-codons '("TAT" "TAC" "TTT" "TTC" "AAT" "AAC" "AAG" "AAA" "ATT" "ATC" "ATA"))
	(GC-rich-codons '("CCA" "CCT" "CCG" "CCC" "GGA" "GGT" "GGG" "GGC" "GCA" "GCT" "GCG" "GCC"))
	(codon "NNN")
	)
    (loop
     (if (>= counter (length seq)) (return (if (zerop nGC) "not possible" (/ nAT (float nGC)))))
     (setq codon (substring seq counter (+ counter 3)))
     (if (member codon AT-rich-codons) (setq nAT (1+ nAT)))
     (if (member codon GC-rich-codons) (setq nGC (1+ nGC)))
;     (insert-string (format  "nAT %d nGC %d" nAT nGC)) ;;debug
     (setq counter (+ counter 3))
     )))




(defun bio-seqinfo (beginning end)
  "Calculates seqinfo"
  (interactive "r")
  (setq bio-current-seq (bio-region-to-seq (buffer-substring beginning end)))
  (let ((nA (+(count ?a bio-current-seq) (count ?A bio-current-seq) ))
	(nT (+(count ?t bio-current-seq) (count ?T bio-current-seq) ))
	(nG (+(count ?g bio-current-seq) (count ?G bio-current-seq) ))
	(nC (+(count ?c bio-current-seq) (count ?C bio-current-seq) ))
	)	 
    (message "%d nt, GC=%.2f%%, GC3=%.2f%%, A=%d, T=%d, G=%d, C=%d, nAT/nGC=%.2f"
	     (length bio-current-seq)
	     (/ (* 100.0 (+ nG nC))	(length bio-current-seq))
	     (/ (* (elt (bio-count-gc bio-current-seq) 2) 300.0) (length bio-current-seq))
	     nA
	     nT
	     nG
	     nC
	     (bio-ATrich-over-GCrich-codons bio-current-seq)
	     )
    )
  )




;;}}}
;;{{{ Sequence Input/Output

(defun bio-pretty-output (seq buffer width)
  ;; prints the sequence seq in #width blocks
  ;; most used for FASTA output
  (let* ((count 0)
	 (max 1)
	 (from-buffer (current-buffer)))
    (while (<= count (length seq))
      (if (>= (+ count width) (length seq))
	  (setq max (+ (length seq) 1))
	(setq max (+ max width)))
      (pop-to-buffer buffer)
      (insert-string (concat (substring seq count (- max 1)) "\n"))
      (setq count (+ count width))
      )
    (pop-to-buffer from-buffer)
    )
  )

(defun bio-readseq (file)
  "Reads a sequence file via a readseq pipe, and discards all
newlines"
  (interactive "f")
  (let* ((seq (exec-to-string (concat "readseq -p -fraw " file)))
	 (buffer-name (generate-new-buffer-name (concat "Readseq:" file))))
    (progn
      (get-buffer-create buffer-name)
      (pop-to-buffer buffer-name)
      (insert-string (replace-in-string seq "\n" ""))))
  )
(defun bio-writeseq (beginning end format file)
  "A wrapper function for readseq, takes a marked region,
removes all non-sequence chars ([ \\t\\n0-9]), asks for a sequence
format (Tab shows all available formats) and finally asks for
a file to write to."
  (interactive (list (region-beginning) ; sequence start
		     (region-end)	; sequence end
		     (let ((completion-ignore-case t) ; sequence format
			   (prompt "Format( press Tab for available formats): "))
		       (completing-read
			prompt
			'(("IG/Stanford" 1) ("Stanford" 1) ("IG" 1)
			  ("GenBank" 2) ("NBRF" 3) ("EMBL" 4) ("GCG" 5) 
			  ("DNAStrider" 6) ("Fitch" 7) ("Fasta" 8) ("Phylip3.2" 11)
			  ("Phylip" 12) ("Raw" 13) ("Plain" 13) ("PIR" 14)
			  ("CODATA" 14)("MSF" 15) ("ASN.1" 16) ("PAUP" 17)
			  ("NEXUS" 17) ("Pretty" 18))
			nil t ))
		     (expand-file-name (read-file-name "File: ")) ; sequence file name
		     ))
  (let* ((seq (bio-region-to-seq (buffer-substring beginning end)))
	 (buffer-name (generate-new-buffer-name "Readseq_w:"))
	 (readseq-command (concat "readseq -p -f" format " " file))
	 )
    (cond ((and (null (file-exists-p file)) (file-writable-p file))
	   (find-file file)
	   (insert-string "> BioEmacs \n" )
	   (bio-pretty-output seq (current-buffer) 50)
	   (save-buffer)
	   (setq bio-current-seq (exec-to-string readseq-command))
	   (delete-region (point-min) (point-max))
	   (insert-string bio-current-seq)
	   (save-buffer))
	  ((message (concat "Error writing " file  ", maybe exists ?"))))
    )
  )

;;}}}
;;{{{ Translation, sequence editing
(defun bio-region-to-seq (region-string)
  "Discards all non (sequence) valid characters and strings
like \(FT     , newlines, tabs, spaces and digits"
  (replace-in-string 
   (replace-in-string region-string  "FT[ \t]+" "")
   "[ \t\n0-9]" ""))

(defun bio-get-sequence (beginning end)
  "extracts the sequence out of the buffer"
  (interactive "r")
  (setq bio-current-seq (bio-region-to-seq (buffer-substring beginning end)))
  (setq bio-seq-file (buffer-name))
  (setq buffer-name (generate-new-buffer-name "*sequence*"))
  (get-buffer-create buffer-name)
  (pop-to-buffer buffer-name)
  (bio-pretty-output bio-current-seq
		     buffer-name 50)
  )
(defun bio-check-seq-type (seq)
  "Checks whether the sequence contains nucleotides or amino acids
returns \"nt\" or \"aa\""
  (setq seq (downcase seq))
  (setq bio-acgt (+ (count ?a seq) (count ?c seq) (count ?g seq) (count ?t seq)))
  (setq bio-acgt  (/ (* 100 bio-acgt) (length seq)))
  (if (>= bio-acgt 80) "nt" "aa"))
(defun bio-translate-codon (codon)
  (bio-gethash (downcase codon) bio-gencode-table))
(defun bio-translate-string (seq)
  (setq bio-aa-seq "")
  (while (>= (length seq) 3)
    (progn
      (setq bio-aa-seq (concat bio-aa-seq 
			       (bio-translate-codon 
				(substring seq 0 3))))
      (setq seq (substring seq 3))
      ))
  bio-aa-seq
  )
(defun bio-puthash (key val table)
  (let ((sym (if (stringp key) (intern key) key)))
    (puthash sym val table)))

(defun bio-gethash (key table &optional default)
  (let ((sym (if (stringp key) (intern-soft key) key)))
    (if (not sym)
	default
      (gethash sym table)
      )))
(defun bio-set-genetic-code ()
  (bio-puthash "aaa" "k" bio-gencode-table)
  (bio-puthash "aac" "n" bio-gencode-table)
  (bio-puthash "aag" "k" bio-gencode-table)
  (bio-puthash "aat" "n" bio-gencode-table)

  (bio-puthash "aca" "t" bio-gencode-table)
  (bio-puthash "acc" "t" bio-gencode-table)
  (bio-puthash "acg" "t" bio-gencode-table)
  (bio-puthash "act" "t" bio-gencode-table)

  (bio-puthash "aga" "r" bio-gencode-table)
  (bio-puthash "agc" "s" bio-gencode-table)
  (bio-puthash "agg" "r" bio-gencode-table)
  (bio-puthash "agt" "s" bio-gencode-table)

  (bio-puthash "ata" "i" bio-gencode-table)
  (bio-puthash "atc" "i" bio-gencode-table)
  (bio-puthash "atg" "m" bio-gencode-table)
  (bio-puthash "att" "i" bio-gencode-table)

  (bio-puthash "caa" "q" bio-gencode-table)
  (bio-puthash "cac" "h" bio-gencode-table)
  (bio-puthash "cag" "q" bio-gencode-table)
  (bio-puthash "cat" "h" bio-gencode-table)

  (bio-puthash "cca" "p" bio-gencode-table)
  (bio-puthash "ccc" "p" bio-gencode-table)
  (bio-puthash "ccg" "p" bio-gencode-table)
  (bio-puthash "cct" "p" bio-gencode-table)

  (bio-puthash "cga" "r" bio-gencode-table)
  (bio-puthash "cgc" "r" bio-gencode-table)
  (bio-puthash "cgg" "r" bio-gencode-table)
  (bio-puthash "cgt" "r" bio-gencode-table)

  (bio-puthash "cta" "l" bio-gencode-table)
  (bio-puthash "ctc" "l" bio-gencode-table)
  (bio-puthash "ctg" "l" bio-gencode-table)
  (bio-puthash "ctt" "l" bio-gencode-table)

  (bio-puthash "gaa" "e" bio-gencode-table)
  (bio-puthash "gac" "d" bio-gencode-table)
  (bio-puthash "gag" "e" bio-gencode-table)
  (bio-puthash "gat" "d" bio-gencode-table)

  (bio-puthash "gca" "a" bio-gencode-table)
  (bio-puthash "gcc" "a" bio-gencode-table)
  (bio-puthash "gcg" "a" bio-gencode-table)
  (bio-puthash "gct" "a" bio-gencode-table)

  (bio-puthash "gga" "g" bio-gencode-table)
  (bio-puthash "ggc" "g" bio-gencode-table)
  (bio-puthash "ggg" "g" bio-gencode-table)
  (bio-puthash "ggt" "g" bio-gencode-table)

  (bio-puthash "gta" "v" bio-gencode-table)
  (bio-puthash "gtc" "v" bio-gencode-table)
  (bio-puthash "gtg" "v" bio-gencode-table)
  (bio-puthash "gtt" "v" bio-gencode-table)

  (bio-puthash "taa" "*" bio-gencode-table)
  (bio-puthash "tac" "y" bio-gencode-table)
  (bio-puthash "tag" "*" bio-gencode-table)
  (bio-puthash "tat" "y" bio-gencode-table)

  (bio-puthash "tca" "s" bio-gencode-table)
  (bio-puthash "tcc" "s" bio-gencode-table)
  (bio-puthash "tcg" "s" bio-gencode-table)
  (bio-puthash "tct" "s" bio-gencode-table)

  (bio-puthash "tga" "*" bio-gencode-table)
  (bio-puthash "tgc" "c" bio-gencode-table)
  (bio-puthash "tgg" "w" bio-gencode-table)
  (bio-puthash "tgt" "c" bio-gencode-table)

  (bio-puthash "tta" "l" bio-gencode-table)
  (bio-puthash "ttc" "f" bio-gencode-table)
  (bio-puthash "ttg" "l" bio-gencode-table)
  (bio-puthash "ttt" "f" bio-gencode-table)
  (bio-puthash "---" "-" bio-gencode-table)

  (bio-puthash "current-code" "universal" bio-gencode-table)
  )
(defun bio-translate-region (beginning end)
  (interactive "r")
  (setq bio-current-seq (bio-region-to-seq (buffer-substring beginning end)))
  (setq bio-seq-file (buffer-name))
  (setq buffer-name (generate-new-buffer-name "*translation*"))
  (get-buffer-create buffer-name)
  (pop-to-buffer buffer-name)
  (if (eq (bio-gethash "current-code" bio-gencode-table) nil)
      (bio-set-genetic-code))
  (if (string= "aa" (bio-check-seq-type 
		     (substring bio-current-seq 0 
				(cond 
				 ((>= (length bio-current-seq) 10) 10)
				 ((length seq))))))
      (insert-string "Warning !! Probably amino acid sequence !!!\n"))
  (insert-string (concat "> " bio-seq-file " " (number-to-string beginning)
			 " " (number-to-string end) "\n"))
  (bio-pretty-output (bio-translate-string bio-current-seq) 
		     buffer-name 50)
  )
(defun bio-reverse (seq)
  "returns the inverted sequence !!! not the antiparallel !!!"
  (let* ((i (length seq))
	 (rseq nil)
	 )
    (while (not (= i 0))
      (setq rseq (concat rseq (char-to-string (elt seq (- i 1)))))
      (setq i (- i 1))
      )
    rseq))
(defun bio-anti (nt)
  (cond
   ((eq nt ?a) "t")
   ((eq nt ?A) "T")
   ((eq nt ?t) "a")
   ((eq nt ?T) "A")
   ((eq nt ?c) "g")
   ((eq nt ?C) "G")
   ((eq nt ?g) "c")
   ((eq nt ?G) "C")
   ((eq nt ?M) "K")
   ((eq nt ?m) "k")
   ((eq nt ?R) "Y")
   ((eq nt ?r) "y")
   ((eq nt ?W) "W")
   ((eq nt ?w) "w")
   ((eq nt ?S) "S")
   ((eq nt ?s) "s")
   ((eq nt ?Y) "R")
   ((eq nt ?y) "r")
   ((eq nt ?K) "M")
   ((eq nt ?k) "m")
   ((eq nt ?V) "B")
   ((eq nt ?v) "b")
   ((eq nt ?H) "D")
   ((eq nt ?h) "d")
   ((eq nt ?D) "H")
   ((eq nt ?d) "h")
   ((eq nt ?B) "V")
   ((eq nt ?b) "v")
   ((eq nt ?N) "N")
   ((eq nt ?n) "n")		
   (?x)
   ))
(defun bio-antiparallel (seq)
  (mapconcat #'bio-anti (bio-reverse seq) nil)
  )
(defun bio-complement (seq)
  (mapconcat #'bio-anti seq nil)
  )
(defun bio-reverse-region (beginning end)   
  (interactive "r")
  (let* ((seq (bio-region-to-seq (buffer-substring beginning end))))
    (progn
      (delete-region beginning end (current-buffer))
      (bio-insert-string (bio-reverse seq))
      )
    ))
(defun bio-antiparallel-region (beginning end)   
  (interactive "r")
  (let* ((seq (bio-region-to-seq (buffer-substring beginning end))))
    (progn
      (delete-region beginning end (current-buffer))
      (bio-insert-string (bio-antiparallel seq))
      )
    ))
(defun bio-complement-region (beginning end)   
  (interactive "r")
  (let* ((seq (bio-region-to-seq (buffer-substring beginning end))))
    (progn
      (delete-region beginning end (current-buffer))
      (bio-insert-string (bio-complement seq))
      )
    ))

;;}}}
;;{{{ Triplets


;; does not work ?
(defun bio-every-triplet (fn seq)
  "Apply FUNCTION to each triplet of SEQUENCE, and make a list of the results."
  (cond
   ((endp seq) nil) 
   ((cons (funcall fn (if (> (length seq) 3) 
			  (substring seq 0 3) 
			(substring seq 0)))
	  (bio-every-triplet fn 
			      (if (> (length seq) 3) 
				  (substring seq 3) 
				nil))
	  ))))

;;}}}


;;{{{ Menu
(defconst bio-sequence-editing-menu
  '("Sequence Editing"
    ["Translate"  bio-translate-region t]
    ["Reverse" bio-reverse-region t]
    ["Antiparallel"  bio-antiparallel-region t]
    ["Complementary"  bio-complement-region t]
    ["Region -> Sequence"  bio-region-to-seq t]))
(defconst bio-converting-menu
  '("Converting"
    ["EMBL -> GTF" bio-convert-embl2-gtf t]
    ["EMBL extract" bio-embl-extract t]))
(defconst bio-readwrite-menu
  '("Sequence Input/Output"
    ["Read Sequence from file" bio-readseq t]
    ["Write Sequence from file" bio-writeseq t]))
(defconst bio-popup-menu-3
  '("Bio Menu"
    ["Oh Bugger, Undo !!!" advertised-undo t]
    ["---" nil t]
    ["Translate"  bio-translate-region t]
    ["Reverse" bio-reverse-region t]
    ["Antiparallel"  bio-antiparallel-region t]
    ["Complementary"  bio-complement-region t]
    ["Region -> Sequence"  bio-region-to-seq t]
    ["---" nil t]
    ["EMBL -> GTF" bio-convert-embl2-gtf t]
    ["EMBL extract" bio-embl-extract t]
    ["---" nil t]
    ["Read Sequence from file" bio-readseq t]
    ["Write Sequence from file" bio-writeseq t]
    ["---" nil t]
    ["Blast on region" bio-blast-on-region t]
    ["Search oligo" bio-search-forward t]
    ["GC%" bio-gccontent-on-region t]))

(defvar bio-mode-map ()
  "Keymap used in Bio mode.")

(defun bio-popup-menu-3 (e)
  (interactive "@e")
  (popup-menu bio-popup-menu-3))



(defun bio-menubar ()
  (add-submenu '("Bio") bio-sequence-editing-menu)
  (add-submenu '("Bio") bio-converting-menu)
  (add-submenu '("Bio") bio-readwrite-menu)
  (add-menu-button '("Bio") ["---" nil t])
  (add-menu-button '("Bio") ["Blast on region" bio-blast-on-region t])
  (add-menu-button '("Bio") ["Search oligo" bio-search-forward t])
  (add-menu-button '("Bio") ["GC%" bio-gccontent-on-region t])

  (setq bio-mode-map (make-sparse-keymap))
  (define-key bio-mode-map [button3] 'bio-popup-menu-3)
  (use-local-map bio-mode-map)
  
  )

;;}}}


(defun bio-mode ()
  "Major mode for manipulating biological data"
  (interactive)
  (setq mode-name "Bio-mode")
  (setq major-mode 'bio-mode)
  (cond ((xemacs-p)
	 (bio-menubar)))
  )
(provide 'bio-mode)
