emacs-unoffice

Emacs library to reclaim text from office documents (abw, odt, docx).
Log | Files | Refs

emacs-unoffice.el (15416B)


      1 ;;; -*- lexical-binding: t -*-
      2 ;;;
      3 ;;; emacs-unoffice.el
      4 ;;;
      5 ;;; Reclaim text from office documents (abw, odt, docx).
      6 ;;;
      7 ;;; Copyright (C) 2020--2021 Tomas Hlavaty <tom at logand dot com>
      8 ;;;
      9 ;;; License: GPLv3 or later
     10 ;;;
     11 ;;; Download: git clone https://logand.com/git/emacs-unoffice.git
     12 ;;;
     13 ;;; News: https://logand.com/sw/emacs-unoffice/atom.xml
     14 ;;;
     15 ;;; Example configuration:
     16 ;;;
     17 ;;; (add-to-list 'load-path "~/git/emacs-unoffice")
     18 ;;; (require 'emacs-unoffice)
     19 ;;;
     20 ;;; and associate file extensions
     21 ;;;
     22 ;;; (add-to-list 'auto-mode-alist '("\\.abw\\'" . unoffice-as-txt))
     23 ;;; (add-to-list 'auto-mode-alist '("\\.docx\\'" . unoffice-as-txt))
     24 ;;; (add-to-list 'auto-mode-alist '("\\.odt\\'" . unoffice-as-txt))
     25 ;;;
     26 ;;; or
     27 ;;;
     28 ;;; (add-to-list 'auto-mode-alist '("\\.abw\\'" . unoffice-as-org))
     29 ;;; (add-to-list 'auto-mode-alist '("\\.docx\\'" . unoffice-as-org))
     30 ;;; (add-to-list 'auto-mode-alist '("\\.odt\\'" . unoffice-as-org))
     31 ;;;
     32 ;;; Dependencies:
     33 ;;;
     34 ;;; - sha256sum from coreutils
     35 
     36 (require 'arc-mode)
     37 (require 'cl-lib)
     38 (require 'org)
     39 (require 'org-table)
     40 (require 'view)
     41 (require 'xml)
     42 
     43 (defcustom unoffice-cache-directory nil
     44   "Specify the directory where to store cache files."
     45   :type 'string
     46   :group 'unoffice)
     47 
     48 (defun unoffice--local-cache-directory ()
     49   (let ((z (or unoffice-cache-directory temporary-file-directory)))
     50     (if (file-remote-p z)
     51 	(error "unoffice-cache-directory is remote %s" z)
     52       z)))
     53 
     54 (defun unoffice--local-cache-file (name extension)
     55   (concat (unoffice--local-cache-directory) "/" name "." extension))
     56 
     57 (defun unoffice--file-hash (file)
     58   (with-temp-buffer
     59     (if (file-remote-p file)
     60   	(process-file "sha256sum" file t)
     61       (call-process "sha256sum" file t))
     62     (buffer-substring (point-min) (+ (point-min) 64))))
     63 
     64 (defun unoffice--cache-remote (file)
     65   (if (file-remote-p file)
     66       (let ((z (unoffice--local-cache-file (unoffice--file-hash file)
     67 					   (file-name-extension file))))
     68 	(unless (file-readable-p z)
     69 	  (copy-file file z))
     70         z)
     71     file))
     72 
     73 (defun unoffice--to-txt (x)
     74   (cl-assert (eq 'unoffice (car x)))
     75   (outline-mode)
     76   (insert "# -*- outline -*-\n")
     77   (insert "\n")
     78   (let (table)
     79     (cl-labels ((ins (x)
     80                      (when (and x (not (equal "" x)))
     81                        (insert x)))
     82                 (fill-previous (x)
     83                       (unless t ;;table
     84                         (when (cdr x)
     85                           (save-excursion
     86                             (backward-paragraph)
     87                             (when (bolp)
     88                               (forward-char))
     89                             (end-of-line)
     90                             (message "%s" (current-column))
     91                             (when (< fill-column (current-column))
     92                               (fill-paragraph))))))
     93                 (rec (x)
     94                      (cl-typecase x
     95                        (string (ins x))
     96                        (cons
     97                         (cl-case (car x)
     98                           (pagebreak
     99                            (unless table
    100                              (unless (bolp)
    101                                (insert "\n"))
    102                              (insert "")))
    103 			  (td
    104 			   (insert "|")
    105 			   (mapc #'rec (cdr x)))
    106 			  (tr
    107 			   (mapc #'rec (cdr x))
    108 			   (insert "\n"))
    109 			  (table
    110                            (fill-previous x)
    111 			   (setq table t)
    112 			   (mapc #'rec (cdr x))
    113 			   (setq table nil)
    114 			   (org-table-align))
    115                           ((h1 h2 h3 h4 h5 h6 p)
    116                            (cl-case (car x)
    117                              (h1 (insert "* "))
    118                              (h2 (insert "** "))
    119                              (h3 (insert "*** "))
    120                              (h4 (insert "**** "))
    121                              (h5 (insert "***** "))
    122                              (h6 (insert "****** ")))
    123                            (fill-previous x)
    124                            (mapc #'rec (cdr x))
    125                            (unless table
    126                              (insert "\n")))
    127                           (t (mapc #'rec (cdr x))))))))
    128       (rec x))))
    129 
    130 (defun unoffice--to-org (x)
    131   (cl-assert (eq 'unoffice (car x)))
    132   (org-mode)
    133   (insert "# -*- org -*-\n")
    134   (insert "#+STARTUP: showeverything\n")
    135   (insert "\n")
    136   (let (table p1 p2 align)
    137     (cl-labels ((ins (x)
    138                      (when (and x (not (equal "" x)))
    139                        (insert x)))
    140                 (rec (x)
    141                      (cl-typecase x
    142                        (string (ins x))
    143                        (cons
    144                         (cl-case (car x)
    145                           (b
    146                            (when (bolp) ;; heading vs bold
    147                              (insert " "))
    148                            (insert "*"))
    149                           (/b (insert "*"))
    150                           (i (insert "/"))
    151                           (/i (insert "/"))
    152                           (u (insert "_"))
    153                           (/u (insert "_"))
    154                           ((left center right)
    155                            (let ((a (car x)))
    156                              (unless (eq align a)
    157                                (when align
    158                                  (insert "#+end_")
    159                                  (insert (symbol-name align))
    160                                  (insert "\n"))
    161                                (setq align a)
    162                                (unless (eq a 'left)
    163                                  (insert "#+begin_")
    164                                  (insert (symbol-name align))
    165                                  (insert "\n")))))
    166                           (pagebreak
    167                            (unless table
    168                              (unless (bolp)
    169                                (insert "\n"))
    170                              (insert "")))
    171 			  (link
    172 			   (insert "[")
    173 			   (mapc #'rec (cdr x))
    174                            (insert "]"))
    175 			  (td
    176 			   (insert "|")
    177 			   (mapc #'rec (cdr x)))
    178 			  (tr
    179 			   (mapc #'rec (cdr x))
    180 			   (insert "\n"))
    181 			  (table
    182 			   (setq table t)
    183 			   (mapc #'rec (cdr x))
    184 			   (setq table nil
    185                                  p1 nil
    186                                  p2 nil)
    187 			   (org-table-align))
    188                           ((h1 h2 h3 h4 h5 h6 p)
    189                            (cl-case (car x)
    190                              (h1 (insert "* "))
    191                              (h2 (insert "** "))
    192                              (h3 (insert "*** "))
    193                              (h4 (insert "**** "))
    194                              (h5 (insert "***** "))
    195                              (h6 (insert "****** ")))
    196                            (mapc #'rec (cdr x))
    197                            (cl-shiftf p1 p2 (cdr x))
    198                            (unless table
    199                              (insert "\n")))
    200                           (t (mapc #'rec (cdr x))))))))
    201       (rec x))
    202     (when (and align (not (eq align 'left)))
    203       (insert "#+end_")
    204       (insert (symbol-name align))
    205       (insert "\n"))))
    206 
    207 (defun unoffice--from-abw ()
    208   (let* (z (left '(left)) (align left) pp p)
    209     (cl-labels ((alignment (x)
    210                            (let ((props (xml-get-attribute x 'props)))
    211                              (when props
    212                                (cond
    213                                 ((cl-search "text-align:left" props)
    214                                  left)
    215                                 ((cl-search "text-align:center" props)
    216                                  '(center))
    217                                 ((cl-search "text-align:right" props)
    218                                  '(right))))))
    219                 (ins (x)
    220                      (when (and x pp (not (equal "" x)))
    221                        (push x p)))
    222                 (rec (x)
    223                      (cl-typecase x
    224                        (string (ins x))
    225                        (cons
    226                         (cl-case (car x)
    227                           (p
    228                            (setq pp t
    229                                  p nil)
    230                            (let ((a (alignment x)))
    231                              (unless (eq align a)
    232                                (setq align a)
    233                                (when align
    234                                  (push align z))))
    235                            (mapc #'rec (cddr x))
    236                            (push (cons
    237                                   (let ((style (xml-get-attribute x 'style)))
    238                                     (cond
    239                                      ((equal style "Heading 1") 'h1)
    240                                      ((equal style "Heading 2") 'h2)
    241                                      ((equal style "Heading 3") 'h3)
    242                                      ((equal style "Heading 4") 'h4)
    243                                      ((equal style "Heading 5") 'h5)
    244                                      ((equal style "Heading 6") 'h6)
    245                                      (t 'p)))
    246                                   (nreverse p))
    247                                  z)
    248                            (setq pp nil))
    249                           (c (mapc #'rec (cddr x)))
    250                           (t (mapc #'rec (cddr x))))))))
    251       (rec (car (xml-parse-file buffer-file-name))))
    252     (cons 'unoffice (nreverse z))))
    253 
    254 (defun unoffice--from-odt ()
    255   (let (z p)
    256     (cl-labels ((rec (x)
    257                      (cl-etypecase x
    258                        (null)
    259                        (string
    260                         (unless (equal "" x)
    261                           (push x p)))
    262                        (cons
    263                         (cl-case (car x)
    264                           (text:p
    265                            (setq p nil)
    266                            (mapc #'rec (cddr x))
    267                            (push (cons 'p (nreverse p)) z))
    268                           (t (mapc #'rec (cddr x))))))))
    269       (rec
    270        (let ((f (unoffice--cache-remote buffer-file-name)))
    271          (with-temp-buffer
    272            (when (zerop (archive-zip-extract f "content.xml"))
    273              (car (xml-parse-region)))))))
    274     (cons 'unoffice (nreverse z))))
    275 
    276 (defun unoffice--from-docx ()
    277   (let (z table tbl tr td p pb rb pi ri pu ru tabs)
    278     (cl-labels ((ins (x)
    279                      (when (and x (not (equal "" x)))
    280                        (push x p)))
    281                 (start-biu ()
    282                            (unless (eq ru pu)
    283                              (setq pu (not pu))
    284                              (ins '(u)))
    285                            (unless (eq ri pi)
    286                              (setq pi (not pi))
    287                              (ins '(i)))
    288                            (unless (eq rb pb)
    289                              (setq pb (not pb))
    290                              (ins '(b))))
    291                 (end-biu ()
    292                          (when pb
    293                            (ins '(/b)))
    294                          (when pi
    295                            (ins '(/i)))
    296                          (when pu
    297                            (ins '(/u))))
    298                 (rec (x)
    299                      (when (consp x)
    300                        (cl-case (car x)
    301                          (w:bookmarkStart
    302                           (ins (cons 'anchor (xml-get-attribute x 'w:name))))
    303                          (w:br
    304                           (when (equal "page" (xml-get-attribute x 'w:type))
    305                             (ins '(pagebreak))))
    306 			 (w:tc
    307                           (setq td nil)
    308                           (mapc #'rec (cddr x))
    309                           (push (cons 'td (nreverse td)) tr))
    310 			 (w:tr
    311                           (setq tr nil)
    312                           (mapc #'rec (cddr x))
    313                           (push (cons 'tr (nreverse tr)) tbl))
    314 			 (w:tbl
    315                           (setq table t
    316                                 tbl nil)
    317                           (mapc #'rec (cddr x))
    318                           (setq table nil)
    319                           (push (cons 'table (nreverse tbl)) z))
    320                          (w:p
    321                           (setq p nil
    322                                 pb nil
    323                                 pi nil
    324                                 pu nil)
    325                           (mapc #'rec (cddr x))
    326                           (end-biu)
    327                           (if table
    328                               (push (cons 'p (nreverse p)) td)
    329                             (push (cons 'p (nreverse p)) z)))
    330                          (w:r
    331 			  (setq rb nil
    332                                 ri nil
    333                                 ru nil)
    334 			  (mapc #'rec (cddr x)))
    335                          (w:drawing (ins "{picture}"))
    336                          (w:hyperlink
    337                           (let ((pp p))
    338                             (setq p nil)
    339                             (mapc #'rec (cddr x))
    340                             (let ((x (cons 'link (nreverse p))))
    341                               (setq p pp)
    342                               (ins x))))
    343                          (w:b (setq rb t))
    344                          (w:i (setq ri t))
    345                          (w:u (setq ru t))
    346                          (w:t
    347                           (start-biu)
    348                           (mapc #'ins (cddr x)))
    349                          (w:tabs
    350                           (setq tabs t)
    351                           (mapc #'rec (cddr x))
    352                           (setq tabs nil))
    353                          (w:tab
    354                           (unless tabs
    355                             (start-biu)
    356                             (ins "\t")))
    357                          (t (mapc #'rec (cddr x)))))))
    358       (rec
    359        (let ((f (unoffice--cache-remote buffer-file-name)))
    360          (with-temp-buffer
    361            (when (or (zerop (archive-zip-extract f "word/document.xml"))
    362                      (zerop (archive-zip-extract f "document.xml")))
    363              (car (xml-parse-region)))))))
    364     (cons 'unoffice (nreverse z))))
    365 
    366 (defun unoffice--as (thunk)
    367   (with-silent-modifications
    368     (erase-buffer)
    369     (funcall thunk))
    370   (setq buffer-read-only t)
    371   (goto-char (point-min))
    372   (view-mode))
    373 
    374 (defalias 'unabw 'unoffice-as-txt
    375   "Deprecated.  Use `unoffice-as-txt' or `unoffice-as-org' instead.")
    376 
    377 (defun unoffice-abw-as-txt ()
    378   (interactive)
    379   (unoffice--as (lambda () (unoffice--to-txt (unoffice--from-abw)))))
    380 
    381 (defun unoffice-abw-as-org ()
    382   (interactive)
    383   (unoffice--as (lambda () (unoffice--to-org (unoffice--from-abw)))))
    384 
    385 (defalias 'unodt 'unoffice-as-txt
    386   "Deprecated.  Use `unoffice-as-txt' or `unoffice-as-org' instead.")
    387 
    388 (defun unoffice-odt-as-txt ()
    389   (interactive)
    390   (unoffice--as (lambda () (unoffice--to-txt (unoffice--from-odt)))))
    391 
    392 (defun unoffice-odt-as-org ()
    393   (interactive)
    394   (unoffice--as (lambda () (unoffice--to-org (unoffice--from-odt)))))
    395 
    396 (defalias 'undocx 'unoffice-as-org
    397   "Deprecated.  Use `unoffice-as-txt' or `unoffice-as-org' instead.")
    398 
    399 (defun unoffice-docx-as-txt ()
    400   (interactive)
    401   (unoffice--as (lambda () (unoffice--to-txt (unoffice--from-docx)))))
    402 
    403 (defun unoffice-docx-as-org ()
    404   (interactive)
    405   (unoffice--as (lambda () (unoffice--to-org (unoffice--from-docx)))))
    406 
    407 (defun unoffice-as-txt ()
    408   (interactive)
    409   (let ((x (file-name-extension buffer-file-name)))
    410     (cond
    411      ((equal x "abw") (unoffice-abw-as-txt))
    412      ((equal x "odt") (unoffice-odt-as-txt))
    413      ((equal x "docx") (unoffice-docx-as-txt)))))
    414 
    415 (defun unoffice-as-org ()
    416   (interactive)
    417   (let ((x (file-name-extension buffer-file-name)))
    418     (cond
    419      ((equal x "abw") (unoffice-abw-as-org))
    420      ((equal x "odt") (unoffice-odt-as-org))
    421      ((equal x "docx") (unoffice-docx-as-org)))))
    422 
    423 (provide 'emacs-unoffice)