emacs-unoffice

Emacs library to reclaim text from office documents (abw, odt, docx).
Log | Files | Refs

commit 64b1a9f808099a3985a01bf1e76ae516333b550a
parent 87ed4f79cbf3bcb84878245558db836b5ce1a97f
Author: Tomas Hlavaty <tom@logand.com>
Date:   Sun, 31 Jan 2021 22:42:34 +0100

introduce intermediate unoffice format

this allows me to separate import and export code

these should roughly work to various degree:

abw|odt|docx -> unoffice -> txt|org

other formats might be introduced later

Diffstat:
Memacs-unoffice.el | 414+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
1 file changed, 300 insertions(+), 114 deletions(-)

diff --git a/emacs-unoffice.el b/emacs-unoffice.el @@ -15,9 +15,14 @@ ;;; Example configuration: ;;; ;;; (require 'unoffice) -;;; (add-to-list 'auto-mode-alist '("\\.abw\\'" . unabw)) -;;; (add-to-list 'auto-mode-alist '("\\.docx\\'" . undocx)) -;;; (add-to-list 'auto-mode-alist '("\\.odt\\'" . unodt)) +;;; and associate file extensions +;;; (add-to-list 'auto-mode-alist '("\\.abw\\'" . unoffice-as-txt)) +;;; (add-to-list 'auto-mode-alist '("\\.docx\\'" . unoffice-as-txt)) +;;; (add-to-list 'auto-mode-alist '("\\.odt\\'" . unoffice-as-txt)) +;;; or +;;; (add-to-list 'auto-mode-alist '("\\.abw\\'" . unoffice-as-org)) +;;; (add-to-list 'auto-mode-alist '("\\.docx\\'" . unoffice-as-org)) +;;; (add-to-list 'auto-mode-alist '("\\.odt\\'" . unoffice-as-org)) ;;; ;;; Dependencies: ;;; @@ -60,136 +65,317 @@ z) file)) -(defun unabw () - (interactive) - (with-silent-modifications - (erase-buffer) - (cl-labels ((rec (x) +(defun unoffice--to-txt (x) + (assert (eq 'unoffice (car x))) + (fundamental-mode) + (let (table) + (cl-labels ((ins (x) + (when (and x (not (equal "" x))) + (insert x))) + (fill-previous (x) + (unless t ;;table + (when (cdr x) + (save-excursion + (backward-paragraph) + (when (bolp) + (forward-char)) + (end-of-line) + (message "%s" (current-column)) + (when (< fill-column (current-column)) + (fill-paragraph)))))) + (rec (x) + (typecase x + (string (ins x)) + (cons + (case (car x) + (pagebreak + (unless table + (unless (bolp) + (insert "\n")) + (insert " "))) + (td + (insert "|") + (mapc #'rec (cdr x))) + (tr + (mapc #'rec (cdr x)) + (insert "\n")) + (table + (fill-previous x) + (setq table t) + (mapc #'rec (cdr x)) + (setq table nil) + (org-table-align)) + (p + (fill-previous x) + (mapc #'rec (cdr x)) + (unless table + (insert "\n"))) + (t (mapc #'rec (cdr x)))))))) + (rec x)))) + +(defun unoffice--to-org (x) + (assert (eq 'unoffice (car x))) + (org-mode) + (insert "# -*- org -*-\n") + (insert "#+STARTUP: showeverything\n") + (insert "\n") + (let (table p1 p2 align) + (cl-labels ((ins (x) + (when (and x (not (equal "" x))) + (insert x))) + (rec (x) + (typecase x + (string (ins x)) + (cons + (case (car x) + (b + (when (bolp) ;; heading vs bold + (insert " ")) + (insert "*")) + (/b (insert "*")) + (i (insert "/")) + (/i (insert "/")) + (u (insert "_")) + (/u (insert "_")) + ((left center right) + (let ((a (car x))) + (unless (eq align a) + (when align + (insert "#+end_") + (insert (symbol-name align)) + (insert "\n")) + (setq align a) + (unless (eq a 'left) + (insert "#+begin_") + (insert (symbol-name align)) + (insert "\n"))))) + (pagebreak + (unless table + (unless (bolp) + (insert "\n")) + (insert " "))) + (link + (insert "[") + (mapc #'rec (cdr x)) + (insert "]")) + (td + (insert "|") + (mapc #'rec (cdr x))) + (tr + (mapc #'rec (cdr x)) + (insert "\n")) + (table + (setq table t) + (mapc #'rec (cdr x)) + (setq table nil + p1 nil + p2 nil) + (org-table-align)) + (p + (mapc #'rec (cdr x)) + (cl-shiftf p1 p2 (cdr x)) + (unless table + (insert "\n"))) + (t (mapc #'rec (cdr x)))))))) + (rec x)))) + +(defun unoffice--from-abw () + (let (z align p) + (cl-labels ((alignment (x) + (let ((props (xml-get-attribute x 'props))) + (when props + (cond + ((cl-search "text-align:left" props) + '(left)) + ((cl-search "text-align:center" props) + '(center)) + ((cl-search "text-align:right" props) + '(right)))))) + (ins (x) + (when (and x (not (equal "" x))) + (push x p))) + (rec (x) (when (consp x) (case (car x) - (p (mapc #'rec (cddr x)) (insert "\n")) - (c (mapc #'insert (cddr x))) + (p + (setq p nil) + (let ((a (alignment x))) + (unless (eq align a) + (push (setq align a) z))) + (mapc #'rec (cddr x)) + (push (cons 'p (nreverse p)) z)) + (c (mapc #'ins (cddr x))) (t (mapc #'rec (cddr x))))))) - (rec (car (xml-parse-file buffer-file-name))))) - (setq buffer-read-only t) - (goto-char (point-min)) - (view-mode)) + (rec (car (xml-parse-file buffer-file-name)))) + (cons 'unoffice (nreverse z)))) -(defun unodt () - (interactive) - (with-silent-modifications - (erase-buffer) +(defun unoffice--from-odt () + (let (z p) (cl-labels ((rec (x) (etypecase x (null) - (string (insert x)) + (string + (unless (equal "" x) + (push x p))) (cons (case (car x) - (text:p (mapc #'rec (cddr x)) (insert "\n")) + (text:p + (setq p nil) + (mapc #'rec (cddr x)) + (push (cons 'p (nreverse p)) z)) (t (mapc #'rec (cddr x)))))))) (rec (let ((f (unoffice--cache-remote buffer-file-name))) (with-temp-buffer (when (zerop (archive-zip-extract f "content.xml")) - (car (xml-parse-region)))))))) - (setq buffer-read-only t) - (goto-char (point-min)) - (view-mode)) + (car (xml-parse-region))))))) + (cons 'unoffice (nreverse z)))) -(defun undocx () - (interactive) - (with-silent-modifications - (erase-buffer) - (insert "# -*- org -*-\n") - (insert "#+STARTUP: showeverything\n") - (insert "\n") - (let (wrote tablep pb rb pi ri pu ru tabs) - (cl-labels ((ins (x) - (when (and x (not (equal "" x))) - (setq wrote t) - (insert x))) - (start-biu () - (unless (eq ru pu) - (setq pu (not pu)) - (ins "_")) - (unless (eq ri pi) - (setq pi (not pi)) - (ins "/")) - (unless (eq rb pb) - (setq pb (not pb)) - (when (bolp) ;; bold vs headings - (ins " ")) - (ins "*"))) - (end-biu () - (when pb - (ins "*")) - (when pi - (ins "/")) - (when pu - (ins "_"))) - (rec (x) - (when (consp x) - (case (car x) - (w:bookmarkStart - (ins "<<") - (ins (xml-get-attribute x 'w:name)) - (ins ">>")) - (w:br - (unless tablep - (when (equal "page" (xml-get-attribute x 'w:type)) - (insert " \n")))) - (w:tc - (insert "|") - (mapc #'rec (cddr x))) - (w:tr - (mapc #'rec (cddr x)) - (insert "\n")) - (w:tbl - (setq tablep t) - (mapc #'rec (cddr x)) - (setq tablep nil) - (org-table-align) - (insert "\n")) - (w:p - (setq wrote nil - pb nil - pi nil - pu nil) +(defun unoffice--from-docx () + (let (z table tbl tr td p pb rb pi ri pu ru tabs) + (cl-labels ((ins (x) + (when (and x (not (equal "" x))) + (push x p))) + (start-biu () + (unless (eq ru pu) + (setq pu (not pu)) + (ins '(u))) + (unless (eq ri pi) + (setq pi (not pi)) + (ins '(i))) + (unless (eq rb pb) + (setq pb (not pb)) + (ins '(b)))) + (end-biu () + (when pb + (ins '(/b))) + (when pi + (ins '(/i))) + (when pu + (ins '(/u)))) + (rec (x) + (when (consp x) + (case (car x) + (w:bookmarkStart + (ins (cons 'anchor (xml-get-attribute x 'w:name)))) + (w:br + (when (equal "page" (xml-get-attribute x 'w:type)) + (ins '(pagebreak)))) + (w:tc + (setq td nil) + (mapc #'rec (cddr x)) + (push (cons 'td (nreverse td)) tr)) + (w:tr + (setq tr nil) + (mapc #'rec (cddr x)) + (push (cons 'tr (nreverse tr)) tbl)) + (w:tbl + (setq table t + tbl nil) + (mapc #'rec (cddr x)) + (setq table nil) + (push (cons 'table (nreverse tbl)) z)) + (w:p + (setq p nil + pb nil + pi nil + pu nil) + (mapc #'rec (cddr x)) + (end-biu) + (if table + (push (cons 'p (nreverse p)) td) + (push (cons 'p (nreverse p)) z))) + (w:r + (setq rb nil + ri nil + ru nil) + (mapc #'rec (cddr x))) + (w:drawing (ins "{picture}")) + (w:hyperlink + (let ((pp p)) + (setq p nil) (mapc #'rec (cddr x)) - (end-biu) - (unless tablep - (when wrote - (fill-paragraph) - (insert "\n\n")))) - (w:r - (setq rb nil - ri nil - ru nil) - (mapc #'rec (cddr x))) - (w:b (setq rb t)) - (w:i (setq ri t)) - (w:u (setq ru t)) - (w:t + (let ((x (cons 'link (nreverse p)))) + (setq p pp) + (ins x)))) + (w:b (setq rb t)) + (w:i (setq ri t)) + (w:u (setq ru t)) + (w:t + (start-biu) + (mapc #'ins (cddr x))) + (w:tabs + (setq tabs t) + (mapc #'rec (cddr x)) + (setq tabs nil)) + (w:tab + (unless tabs (start-biu) - (mapc #'ins (cddr x))) - (w:tabs - (setq tabs t) - (mapc #'rec (cddr x)) - (setq tabs nil)) - (w:tab - (unless tabs - (start-biu) - (ins "\t"))) - (t (mapc #'rec (cddr x))))))) - (rec - (let ((f (unoffice--cache-remote buffer-file-name))) - (with-temp-buffer - (when (or (zerop (archive-zip-extract f "word/document.xml")) - (zerop (archive-zip-extract f "document.xml"))) - (car (xml-parse-region))))))))) + (ins "\t"))) + (t (mapc #'rec (cddr x))))))) + (rec + (let ((f (unoffice--cache-remote buffer-file-name))) + (with-temp-buffer + (when (or (zerop (archive-zip-extract f "word/document.xml")) + (zerop (archive-zip-extract f "document.xml"))) + (car (xml-parse-region))))))) + (cons 'unoffice (nreverse z)))) + +(defun unoffice--as (thunk) + (with-silent-modifications + (erase-buffer) + (funcall thunk)) (setq buffer-read-only t) (goto-char (point-min)) - (org-mode) (view-mode)) +(defalias 'unabw 'unoffice-as-txt + "Deprecated. Use `unoffice-as-txt' or `unoffice-as-org' instead.") + +(defun unoffice-abw-as-txt () + (interactive) + (unoffice--as (lambda () (unoffice--to-txt (unoffice--from-abw))))) + +(defun unoffice-abw-as-org () + (interactive) + (unoffice--as (lambda () (unoffice--to-org (unoffice--from-abw))))) + +(defalias 'unodt 'unoffice-as-txt + "Deprecated. Use `unoffice-as-txt' or `unoffice-as-org' instead.") + +(defun unoffice-odt-as-txt () + (interactive) + (unoffice--as (lambda () (unoffice--to-txt (unoffice--from-odt))))) + +(defun unoffice-odt-as-org () + (interactive) + (unoffice--as (lambda () (unoffice--to-org (unoffice--from-odt))))) + +(defalias 'undocx 'unoffice-as-org + "Deprecated. Use `unoffice-as-txt' or `unoffice-as-org' instead.") + +(defun unoffice-docx-as-txt () + (interactive) + (unoffice--as (lambda () (unoffice--to-txt (unoffice--from-docx))))) + +(defun unoffice-docx-as-org () + (interactive) + (unoffice--as (lambda () (unoffice--to-org (unoffice--from-docx))))) + +(defun unoffice-as-txt () + (interactive) + (let ((x (file-name-extension buffer-file-name))) + (cond + ((equal x "abw") (unoffice-abw-as-txt)) + ((equal x "odt") (unoffice-odt-as-txt)) + ((equal x "docx") (unoffice-docx-as-txt))))) + +(defun unoffice-as-org () + (interactive) + (let ((x (file-name-extension buffer-file-name))) + (cond + ((equal x "abw") (unoffice-abw-as-org)) + ((equal x "odt") (unoffice-odt-as-org)) + ((equal x "docx") (unoffice-docx-as-org))))) + (provide 'unoffice)