emacs-unoffice.el (15416B)
1 ;;; -*- lexical-binding: t -*- 2 ;;; 3 ;;; emacs-unoffice.el 4 ;;; 5 ;;; Reclaim text from office documents (abw, odt, docx). 6 ;;; 7 ;;; Copyright (C) 2020--2021 Tomas Hlavaty <tom at logand dot com> 8 ;;; 9 ;;; License: GPLv3 or later 10 ;;; 11 ;;; Download: git clone https://logand.com/git/emacs-unoffice.git 12 ;;; 13 ;;; News: https://logand.com/sw/emacs-unoffice/atom.xml 14 ;;; 15 ;;; Example configuration: 16 ;;; 17 ;;; (add-to-list 'load-path "~/git/emacs-unoffice") 18 ;;; (require 'emacs-unoffice) 19 ;;; 20 ;;; and associate file extensions 21 ;;; 22 ;;; (add-to-list 'auto-mode-alist '("\\.abw\\'" . unoffice-as-txt)) 23 ;;; (add-to-list 'auto-mode-alist '("\\.docx\\'" . unoffice-as-txt)) 24 ;;; (add-to-list 'auto-mode-alist '("\\.odt\\'" . unoffice-as-txt)) 25 ;;; 26 ;;; or 27 ;;; 28 ;;; (add-to-list 'auto-mode-alist '("\\.abw\\'" . unoffice-as-org)) 29 ;;; (add-to-list 'auto-mode-alist '("\\.docx\\'" . unoffice-as-org)) 30 ;;; (add-to-list 'auto-mode-alist '("\\.odt\\'" . unoffice-as-org)) 31 ;;; 32 ;;; Dependencies: 33 ;;; 34 ;;; - sha256sum from coreutils 35 36 (require 'arc-mode) 37 (require 'cl-lib) 38 (require 'org) 39 (require 'org-table) 40 (require 'view) 41 (require 'xml) 42 43 (defcustom unoffice-cache-directory nil 44 "Specify the directory where to store cache files." 45 :type 'string 46 :group 'unoffice) 47 48 (defun unoffice--local-cache-directory () 49 (let ((z (or unoffice-cache-directory temporary-file-directory))) 50 (if (file-remote-p z) 51 (error "unoffice-cache-directory is remote %s" z) 52 z))) 53 54 (defun unoffice--local-cache-file (name extension) 55 (concat (unoffice--local-cache-directory) "/" name "." extension)) 56 57 (defun unoffice--file-hash (file) 58 (with-temp-buffer 59 (if (file-remote-p file) 60 (process-file "sha256sum" file t) 61 (call-process "sha256sum" file t)) 62 (buffer-substring (point-min) (+ (point-min) 64)))) 63 64 (defun unoffice--cache-remote (file) 65 (if (file-remote-p file) 66 (let ((z (unoffice--local-cache-file (unoffice--file-hash file) 67 (file-name-extension file)))) 68 (unless (file-readable-p z) 69 (copy-file file z)) 70 z) 71 file)) 72 73 (defun unoffice--to-txt (x) 74 (cl-assert (eq 'unoffice (car x))) 75 (outline-mode) 76 (insert "# -*- outline -*-\n") 77 (insert "\n") 78 (let (table) 79 (cl-labels ((ins (x) 80 (when (and x (not (equal "" x))) 81 (insert x))) 82 (fill-previous (x) 83 (unless t ;;table 84 (when (cdr x) 85 (save-excursion 86 (backward-paragraph) 87 (when (bolp) 88 (forward-char)) 89 (end-of-line) 90 (message "%s" (current-column)) 91 (when (< fill-column (current-column)) 92 (fill-paragraph)))))) 93 (rec (x) 94 (cl-typecase x 95 (string (ins x)) 96 (cons 97 (cl-case (car x) 98 (pagebreak 99 (unless table 100 (unless (bolp) 101 (insert "\n")) 102 (insert ""))) 103 (td 104 (insert "|") 105 (mapc #'rec (cdr x))) 106 (tr 107 (mapc #'rec (cdr x)) 108 (insert "\n")) 109 (table 110 (fill-previous x) 111 (setq table t) 112 (mapc #'rec (cdr x)) 113 (setq table nil) 114 (org-table-align)) 115 ((h1 h2 h3 h4 h5 h6 p) 116 (cl-case (car x) 117 (h1 (insert "* ")) 118 (h2 (insert "** ")) 119 (h3 (insert "*** ")) 120 (h4 (insert "**** ")) 121 (h5 (insert "***** ")) 122 (h6 (insert "****** "))) 123 (fill-previous x) 124 (mapc #'rec (cdr x)) 125 (unless table 126 (insert "\n"))) 127 (t (mapc #'rec (cdr x)))))))) 128 (rec x)))) 129 130 (defun unoffice--to-org (x) 131 (cl-assert (eq 'unoffice (car x))) 132 (org-mode) 133 (insert "# -*- org -*-\n") 134 (insert "#+STARTUP: showeverything\n") 135 (insert "\n") 136 (let (table p1 p2 align) 137 (cl-labels ((ins (x) 138 (when (and x (not (equal "" x))) 139 (insert x))) 140 (rec (x) 141 (cl-typecase x 142 (string (ins x)) 143 (cons 144 (cl-case (car x) 145 (b 146 (when (bolp) ;; heading vs bold 147 (insert " ")) 148 (insert "*")) 149 (/b (insert "*")) 150 (i (insert "/")) 151 (/i (insert "/")) 152 (u (insert "_")) 153 (/u (insert "_")) 154 ((left center right) 155 (let ((a (car x))) 156 (unless (eq align a) 157 (when align 158 (insert "#+end_") 159 (insert (symbol-name align)) 160 (insert "\n")) 161 (setq align a) 162 (unless (eq a 'left) 163 (insert "#+begin_") 164 (insert (symbol-name align)) 165 (insert "\n"))))) 166 (pagebreak 167 (unless table 168 (unless (bolp) 169 (insert "\n")) 170 (insert ""))) 171 (link 172 (insert "[") 173 (mapc #'rec (cdr x)) 174 (insert "]")) 175 (td 176 (insert "|") 177 (mapc #'rec (cdr x))) 178 (tr 179 (mapc #'rec (cdr x)) 180 (insert "\n")) 181 (table 182 (setq table t) 183 (mapc #'rec (cdr x)) 184 (setq table nil 185 p1 nil 186 p2 nil) 187 (org-table-align)) 188 ((h1 h2 h3 h4 h5 h6 p) 189 (cl-case (car x) 190 (h1 (insert "* ")) 191 (h2 (insert "** ")) 192 (h3 (insert "*** ")) 193 (h4 (insert "**** ")) 194 (h5 (insert "***** ")) 195 (h6 (insert "****** "))) 196 (mapc #'rec (cdr x)) 197 (cl-shiftf p1 p2 (cdr x)) 198 (unless table 199 (insert "\n"))) 200 (t (mapc #'rec (cdr x)))))))) 201 (rec x)) 202 (when (and align (not (eq align 'left))) 203 (insert "#+end_") 204 (insert (symbol-name align)) 205 (insert "\n")))) 206 207 (defun unoffice--from-abw () 208 (let* (z (left '(left)) (align left) pp p) 209 (cl-labels ((alignment (x) 210 (let ((props (xml-get-attribute x 'props))) 211 (when props 212 (cond 213 ((cl-search "text-align:left" props) 214 left) 215 ((cl-search "text-align:center" props) 216 '(center)) 217 ((cl-search "text-align:right" props) 218 '(right)))))) 219 (ins (x) 220 (when (and x pp (not (equal "" x))) 221 (push x p))) 222 (rec (x) 223 (cl-typecase x 224 (string (ins x)) 225 (cons 226 (cl-case (car x) 227 (p 228 (setq pp t 229 p nil) 230 (let ((a (alignment x))) 231 (unless (eq align a) 232 (setq align a) 233 (when align 234 (push align z)))) 235 (mapc #'rec (cddr x)) 236 (push (cons 237 (let ((style (xml-get-attribute x 'style))) 238 (cond 239 ((equal style "Heading 1") 'h1) 240 ((equal style "Heading 2") 'h2) 241 ((equal style "Heading 3") 'h3) 242 ((equal style "Heading 4") 'h4) 243 ((equal style "Heading 5") 'h5) 244 ((equal style "Heading 6") 'h6) 245 (t 'p))) 246 (nreverse p)) 247 z) 248 (setq pp nil)) 249 (c (mapc #'rec (cddr x))) 250 (t (mapc #'rec (cddr x)))))))) 251 (rec (car (xml-parse-file buffer-file-name)))) 252 (cons 'unoffice (nreverse z)))) 253 254 (defun unoffice--from-odt () 255 (let (z p) 256 (cl-labels ((rec (x) 257 (cl-etypecase x 258 (null) 259 (string 260 (unless (equal "" x) 261 (push x p))) 262 (cons 263 (cl-case (car x) 264 (text:p 265 (setq p nil) 266 (mapc #'rec (cddr x)) 267 (push (cons 'p (nreverse p)) z)) 268 (t (mapc #'rec (cddr x)))))))) 269 (rec 270 (let ((f (unoffice--cache-remote buffer-file-name))) 271 (with-temp-buffer 272 (when (zerop (archive-zip-extract f "content.xml")) 273 (car (xml-parse-region))))))) 274 (cons 'unoffice (nreverse z)))) 275 276 (defun unoffice--from-docx () 277 (let (z table tbl tr td p pb rb pi ri pu ru tabs) 278 (cl-labels ((ins (x) 279 (when (and x (not (equal "" x))) 280 (push x p))) 281 (start-biu () 282 (unless (eq ru pu) 283 (setq pu (not pu)) 284 (ins '(u))) 285 (unless (eq ri pi) 286 (setq pi (not pi)) 287 (ins '(i))) 288 (unless (eq rb pb) 289 (setq pb (not pb)) 290 (ins '(b)))) 291 (end-biu () 292 (when pb 293 (ins '(/b))) 294 (when pi 295 (ins '(/i))) 296 (when pu 297 (ins '(/u)))) 298 (rec (x) 299 (when (consp x) 300 (cl-case (car x) 301 (w:bookmarkStart 302 (ins (cons 'anchor (xml-get-attribute x 'w:name)))) 303 (w:br 304 (when (equal "page" (xml-get-attribute x 'w:type)) 305 (ins '(pagebreak)))) 306 (w:tc 307 (setq td nil) 308 (mapc #'rec (cddr x)) 309 (push (cons 'td (nreverse td)) tr)) 310 (w:tr 311 (setq tr nil) 312 (mapc #'rec (cddr x)) 313 (push (cons 'tr (nreverse tr)) tbl)) 314 (w:tbl 315 (setq table t 316 tbl nil) 317 (mapc #'rec (cddr x)) 318 (setq table nil) 319 (push (cons 'table (nreverse tbl)) z)) 320 (w:p 321 (setq p nil 322 pb nil 323 pi nil 324 pu nil) 325 (mapc #'rec (cddr x)) 326 (end-biu) 327 (if table 328 (push (cons 'p (nreverse p)) td) 329 (push (cons 'p (nreverse p)) z))) 330 (w:r 331 (setq rb nil 332 ri nil 333 ru nil) 334 (mapc #'rec (cddr x))) 335 (w:drawing (ins "{picture}")) 336 (w:hyperlink 337 (let ((pp p)) 338 (setq p nil) 339 (mapc #'rec (cddr x)) 340 (let ((x (cons 'link (nreverse p)))) 341 (setq p pp) 342 (ins x)))) 343 (w:b (setq rb t)) 344 (w:i (setq ri t)) 345 (w:u (setq ru t)) 346 (w:t 347 (start-biu) 348 (mapc #'ins (cddr x))) 349 (w:tabs 350 (setq tabs t) 351 (mapc #'rec (cddr x)) 352 (setq tabs nil)) 353 (w:tab 354 (unless tabs 355 (start-biu) 356 (ins "\t"))) 357 (t (mapc #'rec (cddr x))))))) 358 (rec 359 (let ((f (unoffice--cache-remote buffer-file-name))) 360 (with-temp-buffer 361 (when (or (zerop (archive-zip-extract f "word/document.xml")) 362 (zerop (archive-zip-extract f "document.xml"))) 363 (car (xml-parse-region))))))) 364 (cons 'unoffice (nreverse z)))) 365 366 (defun unoffice--as (thunk) 367 (with-silent-modifications 368 (erase-buffer) 369 (funcall thunk)) 370 (setq buffer-read-only t) 371 (goto-char (point-min)) 372 (view-mode)) 373 374 (defalias 'unabw 'unoffice-as-txt 375 "Deprecated. Use `unoffice-as-txt' or `unoffice-as-org' instead.") 376 377 (defun unoffice-abw-as-txt () 378 (interactive) 379 (unoffice--as (lambda () (unoffice--to-txt (unoffice--from-abw))))) 380 381 (defun unoffice-abw-as-org () 382 (interactive) 383 (unoffice--as (lambda () (unoffice--to-org (unoffice--from-abw))))) 384 385 (defalias 'unodt 'unoffice-as-txt 386 "Deprecated. Use `unoffice-as-txt' or `unoffice-as-org' instead.") 387 388 (defun unoffice-odt-as-txt () 389 (interactive) 390 (unoffice--as (lambda () (unoffice--to-txt (unoffice--from-odt))))) 391 392 (defun unoffice-odt-as-org () 393 (interactive) 394 (unoffice--as (lambda () (unoffice--to-org (unoffice--from-odt))))) 395 396 (defalias 'undocx 'unoffice-as-org 397 "Deprecated. Use `unoffice-as-txt' or `unoffice-as-org' instead.") 398 399 (defun unoffice-docx-as-txt () 400 (interactive) 401 (unoffice--as (lambda () (unoffice--to-txt (unoffice--from-docx))))) 402 403 (defun unoffice-docx-as-org () 404 (interactive) 405 (unoffice--as (lambda () (unoffice--to-org (unoffice--from-docx))))) 406 407 (defun unoffice-as-txt () 408 (interactive) 409 (let ((x (file-name-extension buffer-file-name))) 410 (cond 411 ((equal x "abw") (unoffice-abw-as-txt)) 412 ((equal x "odt") (unoffice-odt-as-txt)) 413 ((equal x "docx") (unoffice-docx-as-txt))))) 414 415 (defun unoffice-as-org () 416 (interactive) 417 (let ((x (file-name-extension buffer-file-name))) 418 (cond 419 ((equal x "abw") (unoffice-abw-as-org)) 420 ((equal x "odt") (unoffice-odt-as-org)) 421 ((equal x "docx") (unoffice-docx-as-org))))) 422 423 (provide 'emacs-unoffice)