unoffice

Reclaim text from office documents
git clone https://logand.com/git/unoffice.git/
Log | Files | Refs | README

commit 81f916b9299dd8434e765fc230d0e177cae72bdd
parent f369fa3eefe82cdc67485cb430d67e5f7e48b505
Author: Tomas Hlavaty <tom@logand.com>
Date:   Tue,  2 Jul 2019 07:37:08 +0200

first try

Diffstat:
A undocx | 13 +++++++++++++
A unodt | 12 ++++++++++++
2 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/undocx b/undocx
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+set -euo pipefail  # NOTE(review): 's/\r//g' below replaces an empty sed pattern; CR assumed - confirm
+unzip -p "$1" \
+    | grep -a '<w:r' \
+    | sed 's/<w:p[^<\/]*>/\n/g' \
+    | sed 's/<[^<]*>//g' \
+    | sed 's/\r//g' \
+    | sed 's/&lt;/</g' \
+    | sed 's/&gt;/>/g' \
+    | sed "s/&apos;/'/g" \
+    | sed 's/&quot;/"/g' \
+    | sed 's/&amp;/&/g' \
+    | cat -s  # squeeze runs of blank lines into one
diff --git a/unodt b/unodt
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#set -euo pipefail
+unzip -p "$1" \
+    | grep -a '<text:p' \
+    | sed 's/<text:p[^<\/]*>/\n/g' \
+    | sed 's/<[^<]*>//g' \
+    | sed 's/&lt;/</g' \
+    | sed 's/&gt;/>/g' \
+    | sed "s/&apos;/'/g" \
+    | sed 's/&quot;/"/g' \
+    | sed 's/&amp;/&/g' \
+    | cat -s  # squeeze runs of blank lines into one