cl-rw

Layered streams for Common Lisp
git clone https://logand.com/git/cl-rw.git/
Log | Files | Refs

commit 3fa532f2a7325f94a5798f86d77e8964df6e3434
parent 6e32c8bc706b32b633157d46f2ccd4feb9d37b6f
Author: Tomas Hlavaty <tom@logand.com>
Date:   Sun, 20 Sep 2015 18:15:40 +0200

utf8 codepoint reader added

Diffstat:
Mrw.lisp | 53+++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 53 insertions(+), 0 deletions(-)

diff --git a/rw.lisp b/rw.lisp @@ -44,6 +44,7 @@ :next-u32be :next-u32le :next-u8 + :next-utf8 :next-z0 :peek :peek-reader @@ -63,6 +64,7 @@ :u32 :u32be :u32le + :utf8-reader :wrap-reader :wrap-writer :write-octets @@ -334,6 +336,57 @@ (when x (parse-integer (coerce x 'string) :radix radix)))) +(defun next-utf8 (reader) + (let ((i1 (rw:next reader)) i2 i3 i4 o2 o3 o4) + (macrolet ((wrong () + `(error "wrong UTF-8 sequence ~x ~x ~x ~x" i1 i2 i3 i4)) + (tail (i o) + `(progn + (setq ,i (rw:next reader)) + (unless (and (typep ,i '(unsigned-byte 8)) + (= #x80 (logand #b11000000 ,i))) + (wrong)) + (setq ,o (logand #b00111111 ,i))))) + (cond + ((not i1) nil) + ((not (typep i1 '(unsigned-byte 8))) + (wrong)) + ((<= #b00000000 i1 #b01111111) ;; one + i1) + ((<= #b11000000 i1 #b11011111) ;; two + (tail i2 o2) + (let ((z (logior (ash (logand #x1f i1) 6) o2))) + (unless (<= #x000080 z #x0007ff) + (wrong)) + z)) + ((<= #b11100000 i1 #b11101111) ;; three + (tail i2 o2) + (tail i3 o3) + (let ((z (logior (ash (logand #x0f i1) 12) (ash o2 6) o3))) + (unless (or (<= #x000800 z #x00d7ff) + (<= #x00e000 z #x00ffff)) + (wrong)) + z)) + ((<= #b11110000 i1 #b11110111) ;; four + (tail i2 o2) + (tail i3 o3) + (tail i4 o4) + (let ((z (logior (ash (logand #x07 i1) 18) (ash o2 12) (ash o3 6) o4))) + (unless (<= #x010000 z #x10ffff) + (wrong)) + z)) + (t (wrong)))))) + +(defun utf8-reader (octet-reader) + (lambda () + (next-utf8 octet-reader))) + +;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#x24))))) +;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xc2 #xa2))))) +;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xe2 #x82 #xac))))) +;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xf0 #x90 #x8d #x88))))) +;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xc0 #x80))))) ;; overlong + (defun bit-reader (octet-reader) (let (octet bit) (lambda ()