;;; char-db.el --- Character Database utility

;; Copyright (C) 1998 MORIOKA Tomohiko.

;; Author: MORIOKA Tomohiko <morioka@jaist.ac.jp>
;; Keywords: ISO/IEC 10646, Unicode, UCS-4, UTF-8, MULE.

;; This file is part of XEmacs-UCS.

;; XEmacs-UCS is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published
;; by the Free Software Foundation; either version 2, or (at your
;; option) any later version.

;; XEmacs-UCS is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with XEmacs; see the file COPYING.  If not, write to the Free
;; Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
;; 02111-1307, USA.

;;; Code:

(require 'xemacs-ucs)

(defvar unicode-data-path
  (let ((default-load-path '("/pub/misc/character/tables/unicode/"
			     "/ftp@ftp.unicode.org:/Public/"))
	path)
    (setq path (get-latest-path "Update"))
    (if path
	(car (sort (directory-files
		    path t "UnicodeData" t)
		   (function file-newer-than-file-p))))))

(defvar ideograph-variants-data-path
  (let ((default-load-path
	  '("/pub/misc/character/tables/"
	    "/ftp@ftp.jaist.ac.jp:/pub/misc/character/tables/"))
	path)
    (setq path (get-latest-path "yasuoka"))
    (if path
	(car (sort (directory-files
		    path t "Variants.Z" t)
		   (function file-newer-than-file-p))))))

(defun ucs-decode-general-category (cat)
  (cdr
   (assoc
    cat
    '(;; Normative
      ("Mn" . "Mark, Non-Spacing")
      ("Mc" . "Mark, Spacing Combining")
      ("Me" . "Mark, Enclosing")
      ("Nd" . "Number, Decimal Digit")
      ("Nl" . "Number, Letter")
      ("No" . "Number, Other")
      ("Zs" . "Separator, Space")
      ("Zl" . "Separator, Line")
      ("Zp" . "Separator, Paragraph")
      ("Cc" . "Other, Control")
      ("Cf" . "Other, Format")
      ("Cs" . "Other, Surrogate")
      ("Co" . "Other, Private Use")
      ("Cn" . "Other, Not Assigned")
      ;; Informative
      ("Lu" . "Letter, Uppercase")
      ("Ll" . "Letter, Lowercase")
      ("Lt" . "Letter, Titlecase")
      ("Lm" . "Letter, Modifier")
      ("Lo" . "Letter, Other")
      
      ("Pc" . "Punctuation, Connector")
      ("Pd" . "Punctuation, Dash")
      ("Ps" . "Punctuation, Open")
      ("Pe" . "Punctuation, Close")
      ("Pi" . "Punctuation, Initial quote")
      ("Pf" . "Punctuation, Final quote")
      ("Po" . "Punctuation, Other")
      
      ("Sm" . "Symbol, Math")
      ("Sc" . "Symbol, Currency")
      ("Sk" . "Symbol, Modifier")
      ("So" . "Symbol, Other")
      ))))

(defun ucs-decode-canonical-combining-class (class)
  (cdr
   (assoc
    class
    '(("0" . "Spacing, enclosing, reordrant, and surrounding")
      ("1" . "Overlays and interior")
      ("6" . "Tibetan subjoined Letters")
      ("7" . "Nuktas")
      ("8" . "Hiragana/Katakana voiced marks")
      ("9" . "Viramas")
      ("10" . "Start of fixed position classes")
      ("199" . "End of fixed position classes")
      ("200" . "Below left attached")
      ("202" . "Below attached")
      ("204" . "Below right attached")
      ("208" . "Left attached (reordrant around single base character)")
      ("210" . "Right attached")
      ("212" . "Above left attached")
      ("214" . "Above attached")
      ("216" . "Above right attached")
      ("218" . "Below left")
      ("220" . "Below")
      ("222" . "Below right")
      ("224" . "Left (reordrant around single base character)")
      ("226" . "Right")
      ("228" . "Above left")
      ("230" . "Above")
      ("232" . "Above right")
      ("234" . "Double above")
      ))))

(defun ucs-decode-bidirectional-property (d)
  (cdr
   (assoc
    d
    '(;; Strong types:
      ("L" . "Left-Right")
      ("R" . "Right-Left")
      ;;Weak types:
      ("EN" . "European Number")
      ("ES" . "European Number Separator")
      ("ET" . "European Number Terminator")
      ("AN" . "Arabic Number")
      ("CS" . "Common Number Separator")

      ;;Separators:
      ("B" . "Block Separator")
      ("S" . "Segment Separator")

      ;;Neutrals:
      ("WS" . "Whitespace")
      ("ON" . "Other Neutrals")
      ))))

(defun ucs-decode-mirror-able (m)
  (if (string= m "Y")
      "Yes"
    "No"))

(defun yasuoka-decode-char-representation (char-rep)
  (cond ((string-match "^JIS78-\\([0-9][0-9]\\)\\([0-9][0-9]\\)" char-rep)
	 (let ((h (+ (string-to-int (match-string 1 char-rep)) 32))
	       (l (+ (string-to-int (match-string 2 char-rep)) 32)))
	   (format "J78-@-%02X%02X (%c)"
		   h l (make-char 'japanese-jisx0208-1978 h l))
	   ))
	((string-match "^JIS83-\\([0-9][0-9]\\)\\([0-9][0-9]\\)" char-rep)
	 (let ((h (+ (string-to-int (match-string 1 char-rep)) 32))
	       (l (+ (string-to-int (match-string 2 char-rep)) 32)))
	   (format "J83-B-%02X%02X (%c)"
		   h l (make-char 'japanese-jisx0208 h l))
	   ))
	((string-match "^JIS90-\\([0-9][0-9]\\)\\([0-9][0-9]\\)" char-rep)
	 (let ((h (+ (string-to-int (match-string 1 char-rep)) 32))
	       (l (+ (string-to-int (match-string 2 char-rep)) 32)))
	   (format "J90@B-%02X%02X" h l)
	   ))
	((string-match "^JIS-\\([0-9][0-9]\\)\\([0-9][0-9]\\)" char-rep)
	 (let ((h (+ (string-to-int (match-string 1 char-rep)) 32))
	       (l (+ (string-to-int (match-string 2 char-rep)) 32)))
	   (format "JIS-B-%02X%02X (%c)"
		   h l (make-char 'japanese-jisx0208 h l))
	   ))
	((string-match "^JIS\\+-\\([0-9][0-9]\\)\\([0-9][0-9]\\)" char-rep)
	 (let ((h (+ (string-to-int (match-string 1 char-rep)) 32))
	       (l (+ (string-to-int (match-string 2 char-rep)) 32)))
	   (format "J90-D-%02X%02X (%c)"
		   h l (make-char 'japanese-jisx0212 h l))
	   ))
	((string-match "^GB-\\([0-9][0-9]\\)\\([0-9][0-9]\\)" char-rep)
	 (let ((h (+ (string-to-int (match-string 1 char-rep)) 32))
	       (l (+ (string-to-int (match-string 2 char-rep)) 32)))
	   (format "GB0-A-%02X%02X (%c)"
		   h l (make-char 'chinese-gb2312 h l))
	   ))
	((string-match "^GB\\([1-5]\\)-\\([0-9][0-9]\\)\\([0-9][0-9]\\)"
		       char-rep)
	 (let ((p (+ (string-to-int (match-string 1 char-rep))))
	       (h (+ (string-to-int (match-string 2 char-rep)) 32))
	       (l (+ (string-to-int (match-string 3 char-rep)) 32)))
	   (format "GB%d-?-%02X%02X" p h l)
	   ))
	((string-match "^GB\\+-\\([0-9][0-9]\\)\\([0-9][0-9]\\)" char-rep)
	 (let ((h (+ (string-to-int (match-string 1 char-rep)) 32))
	       (l (+ (string-to-int (match-string 2 char-rep)) 32)))
	   (format "GB+-E-%02X%02X (%c)"
		   h l (make-char 'chinese-isoir165 h l))
	   ))
	((string-match
	  "^CNS\\([1-7]\\)-\\([0-9A-F][0-9A-F]\\)\\([0-9A-F][0-9A-F]\\)"
	  char-rep)
	 (let* ((p (string-to-int (match-string 1 char-rep)))
		(h (string-to-int (match-string 2 char-rep) 16))
		(l (string-to-int (match-string 3 char-rep) 16))
		(charset (intern (format "chinese-cns11643-%d" p))))
	   (format "C%02d-%c-%02X%02X (%c)"
		   p (charset-final charset)
		   h l (make-char charset h l))
	   ))
	((string-match
	  "^BIG5-\\([0-9A-F][0-9A-F]\\)\\([0-9A-F][0-9A-F]\\)"
	  char-rep)
	 (let* ((h (string-to-int (match-string 1 char-rep) 16))
		(l (string-to-int (match-string 2 char-rep) 16)))
	   (format "BIG5-%02X%02X (%c)"
		   h l
		   (decode-big5-char (cons h l)))
	   ))
	(t char-rep)
	))

;;;###autoload
(defun what-character (char)
  (interactive (list (char-after)))
  (let ((buf (get-buffer-create "*Character Description*"))
	(the-buf (current-buffer))
	(win-conf (current-window-configuration))
	ychar)
    (pop-to-buffer buf)
    (make-local-variable 'what-character-original-window-configuration)
    (setq what-character-original-window-configuration win-conf)
    (setq buffer-read-only nil)
    (erase-buffer)
    (let (charset h l)
      (let ((sc (split-char char)))
	(setq charset (car sc)
	      h (nth 1 sc)
	      l (nth 2 sc)))
      (insert (format "%c\n    %s: " char charset))
      (insert (if l
		  (format "%02d-%02d, #x%02X%02X, %d\n"
			  (- h 32) (- l 32) h l (+ (lsh h 8) l))
		(format "%d/%d, #x%02X, %d\n"
			(lsh h -4) (logand h 15) h h)))
      (insert (format "    (%s)\n" (charset-description charset)))
      (let ((ucs (char-ucs char)))
	(when ucs
	  (insert (format "    UCS: %08X" ucs))
	  (if unicode-data-path
	      (let ((str
		     (with-current-buffer
			 (find-file-noselect unicode-data-path)
		       (goto-char (point-min))
		       (if (re-search-forward (format "^%04X;" ucs) nil t)
			   (buffer-substring (match-end 0)(point-at-eol))
			 ))))
		(if str
		    (let ((data (split-string str ";")))
		      (insert (format " (%s)\n" (nth 0 data)))
		      (insert
		       (format "    General Category: %s\n"
			       (ucs-decode-general-category (nth 1 data))))
		      (or (string= (nth 2 data) "")
			  (insert
			   (format "    Canonical Combining Classes: %s\n"
				   (ucs-decode-canonical-combining-class
				    (nth 2 data)))))
		      (or (string= (nth 3 data) "")
			  (insert (format "    Bidirectional Category: %s\n"
					  (ucs-decode-bidirectional-property
					   (nth 3 data)))))
		      (or (string= (nth 4 data) "")
			  (insert (format "    Character Decomposition: %s\n"
					  (nth 4 data))))
		      (or (string= (nth 5 data) "")
			  (insert (format "    Decimal digit value: %s\n"
					  (nth 5 data))))
		      (or (string= (nth 6 data) "")
			  (insert (format "    Digit value: %s\n"
					  (nth 6 data))))
		      (or (string= (nth 7 data) "")
			  (insert (format "    Numeric value: %s\n"
					  (nth 7 data))))
		      (or (string= (nth 8 data) "")
			  (insert (format "    Mirror-able: %s\n"
					  (ucs-decode-mirror-able
					   (nth 8 data)))))
		      (or (string= (nth 9 data) "")
			  (insert (format "    Unicode 1.0 Name: %s\n"
					  (nth 9 data))))
		      (or (string= (nth 10 data) "")
			  (insert (format "    10646 Comment: %s\n"
					  (nth 10 data))))
		      (or (string= (nth 11 data) "")
			  (insert
			   (format "    Uppercase = %s (%c)\n"
				   (nth 11 data)
				   (ucs-char
				    (string-to-int (nth 11 data) 16)))))
		      (or (string= (nth 12 data) "")
			  (insert
			   (format "    Lowercase = %s (%c)\n"
				   (nth 12 data)
				   (ucs-char
				    (string-to-int (nth 12 data) 16)))))
		      (or (string= (nth 13 data) "")
			  (insert
			   (format "    Title-case = %s (%c)\n"
				   (nth 13 data)
				   (ucs-char
				    (string-to-int (nth 13 data) 16)))))
		      )
		  (insert "\n"))
		))
	  (if (and (<= #x4E00 ucs) (<= ucs #x9FA5)
		   ideograph-variants-data-path)
	      (let ((str
		     (with-current-buffer
			 (find-file-noselect ideograph-variants-data-path)
		       (goto-char (point-min))
		       (if (re-search-forward (format "UCS-%04X" ucs) nil t)
			   (buffer-substring (point-at-bol)(point-at-eol))
			 ))))
		(if str
		    (insert
		     (concat
		      "    Variants: "
		      (mapconcat
		       (lambda (v)
			 (mapconcat #'yasuoka-decode-char-representation
				    (split-string v ",")
				    ", ")
			 )
		       (split-string str " ") "\n              ")
		      "\n")))
		))
	  )))
    (set-buffer-modified-p nil)
    (view-mode the-buf (lambda (buf)
			 (set-window-configuration
			  what-character-original-window-configuration)
			 ))
    (goto-char (point-min))))

(provide 'char-db)

;;; char-db.el ends here
