cl-rw

Layered streams for Common Lisp
git clone https://logand.com/git/cl-rw.git/
Log | Files | Refs

xml.lisp (6349B)


      1 ;;; Copyright (C) 2013, 2014 Tomas Hlavaty <tom@logand.com>
      2 ;;;
      3 ;;; Permission is hereby granted, free of charge, to any person
      4 ;;; obtaining a copy of this software and associated documentation
      5 ;;; files (the "Software"), to deal in the Software without
      6 ;;; restriction, including without limitation the rights to use, copy,
      7 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
      8 ;;; of the Software, and to permit persons to whom the Software is
      9 ;;; furnished to do so, subject to the following conditions:
     10 ;;;
     11 ;;; The above copyright notice and this permission notice shall be
     12 ;;; included in all copies or substantial portions of the Software.
     13 ;;;
     14 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     15 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     16 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     17 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
     18 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
     19 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     20 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21 ;;; DEALINGS IN THE SOFTWARE.
     22 
     23 (defpackage :rw.xml
     24   (:use :cl)
     25   (:export :xmarkup-reader
     26            :parse-xml))
     27 
     28 (in-package :rw.xml)
     29 
     30 ;; https://github.com/drewc/smug/blob/master/smug.org
     31 ;; http://www.htmlhelp.com/reference/wilbur/misc/comment.html
     32 
     33 ;; TODO xml is made of bytes, not chars
     34 
     35 (defun parse-xml-attributes (reader finish) ;; finish='((#\/))|'((#\?))
     36   (flet ((peek () (rw:peek reader))
     37          (next () (rw:next reader))
     38          (skip () (rw:skip reader))
     39          (till (markers) (rw:till reader markers)))
     40     (do (z)
     41         ((progn (skip) (eql #\> (peek)))
     42          (next)
     43          (let ((f (equal (car z) finish)))
     44            (when f (pop z))
     45            (values (nreverse z) f)))
     46       (push (cons (prog1 (till '(#\space #\tab #\return #\newline #\> #\=))
     47                     (skip)
     48                     (when (eql #\= (peek))
     49                       (next)
     50                       (skip)))
     51                   (let ((q (peek)))
     52                     (case q
     53                       ((#\" #\')
     54                        (next)
     55                        (prog1 (till (list q))
     56                          (assert (eql q (next)))))
     57                       (t ;; TODO only when enabled no-quoted-attributes
     58                        (till '(#\space #\tab #\return #\newline #\>))))))
     59             z))))
     60 
     61 (defun xmarkup-reader (reader)
     62   (flet ((peek () (rw:peek reader))
     63          (next () (rw:next reader))
     64          (skip () (rw:skip reader))
     65          (till (markers) (rw:till reader markers)))
     66     (lambda ()
     67       (case (peek)
     68         ((nil))
     69         (#\<
     70          (next)
     71          (skip)
     72          (let ((e (till '(#\space #\tab #\return #\newline #\>))))
     73            (skip)
     74            (case (car e) ;; TODO doctype
     75              (#\?
     76               (multiple-value-bind (a f)
     77                   (parse-xml-attributes reader '((#\?)))
     78                 (assert f)
     79                 (cons :pi (cons (cdr e) a))))
     80              (#\!
     81               (prog1 (cons :comment (till '(#\>))) ;; TODO properly
     82                 (assert (eql #\> (next)))))
     83              (#\/
     84               (assert (eql #\> (next)))
     85               (cons :end (cdr e)))
     86              (t
     87               (multiple-value-bind (a f)
     88                   (parse-xml-attributes reader '((#\/)))
     89                 (unless f
     90                   (when (equal '(#\/) (last e))
     91                     (setq f t
     92                           e (nreverse (cdr (nreverse e)))))) ;; TODO better
     93                 (cons (if f :begin/ :begin) (cons e a)))))))
     94         (t (cons :text (till '(#\<)))))))) ;; TODO entities
     95 
     96 (defun parse-xml (x &optional upcasep package)
     97   (labels ((id (x)
     98              (intern (if upcasep
     99                          (string-upcase (concatenate 'string x))
    100                          (concatenate 'string x))
    101                      (or package :keyword)))
    102            (xattrs (x)
    103              (loop
    104                 for (f . r) in x
    105                 appending (list (id f) (concatenate 'string r))))
    106            (parse (r)
    107              (do ((z (list nil))
    108                   (r (xmarkup-reader (rw:skip r)))
    109                   a)
    110                  ((not (setq a (rw:next r)))
    111                   (let ((y (pop z)))
    112                     (assert (not z))
    113                     (assert (not (cdr y)))
    114                     (car y)))
    115                (ecase (car a)
    116                  (:pi)
    117                  (:comment)
    118                  (:begin/
    119                   (let ((tag (id (cadr a)))
    120                         (attrs (xattrs (cddr a))))
    121                     (push (list (if attrs (cons tag attrs) tag)) (car z))))
    122                  (:begin
    123                   (let ((tag (cadr a))
    124                         (attrs (xattrs (cddr a))))
    125                     (push (list (if attrs (cons tag attrs) tag)) z))
    126                   (push nil z))
    127                  (:end
    128                   (let ((tag (cdr a))
    129                         (b (nreverse (pop z)))
    130                         (e (pop z)))
    131                     (assert e)
    132                     (assert z)
    133                     (let* ((h (car e))
    134                            (tag2 (if (atom (car h)) h (car h)))
    135                            (attrs (unless (atom (car h)) (cdr h))))
    136                       (assert (equal tag tag2))
    137                       (push (cons (if attrs (cons (id tag) attrs) (id tag)) b)
    138                             (car z)))))
    139                  (:text
    140                   (when (and (cdr z)
    141                              (find-if-not
    142                               (lambda (c)
    143                                 (or (member c '(#\space #\tab #\return #\newline))))
    144                               (cdr a)))
    145                     (push (string-trim '(#\space #\tab #\return #\newline)
    146                                        (concatenate 'string (cdr a)))
    147                           (car z))))))))
    148     (etypecase x
    149       (function (parse x))
    150       (stream (parse (rw:peek-reader (rw:char-reader x))))
    151       ((or list vector) (parse (rw:peek-reader (rw:reader x))))
    152       (pathname (with-open-file (s x)
    153                   (parse (rw:peek-reader (rw:char-reader s))))))))
    154 
    155 ;;(parse-xml "<rss><ahoj/>hi<cau></cau><br x='314'/></rss>")
    156 ;;(parse-xml "<rss a='1' b='2'>hi<br/></rss>")
    157 ;;(parse-xml "<rss>hi<br/></hello>")
    158 ;;(parse-xml #p"/home/tomas/git/cl-rw/a.xml")