olefs

command line tools to extract data from OLE documents like doc, ppt, xls, msg
git clone https://logand.com/git/olefs.git/
Log | Files | Refs

commit 9a3b0a6df06fd40591986a6b1bcd5acf093bfb06
parent a9018212f9f623c9e1cf5b43b85b792894e7b193
Author: Tomas Hlavaty <tom@logand.com>
Date:   Sun, 14 Jul 2019 23:35:30 +0200

utf8, unppt and ppt2html

Diffstat:
M cfb.c | 7 ++-----
M default.nix | 2 +-
M ppt.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++------------------------
A ppt2html | 17 +++++++++++++++++
A unppt | 3 +++
D utf8.c | 151 ------------------------------------------------------------------------------
6 files changed, 71 insertions(+), 181 deletions(-)

diff --git a/cfb.c b/cfb.c @@ -1,4 +1,5 @@ // TODO version 4 with bigger sector size +// TODO from pipe without seek const char *VERSION = #include "VERSION" @@ -12,7 +13,6 @@ const char *VERSION = #include <fcntl.h> #include <unistd.h> #include <stdint.h> -//#include <iconv.h> // TODO wchar -> utf8 properly // MS-CFB Compound File Binary File Format @@ -294,7 +294,6 @@ static void open_cfb_file(char *filename, struct cfb_file *x) { fprintf(stderr, "Unable to open '%s'.\n", filename); exit(1); } - //conv = iconv_open("UTF-8", "UTF-16LE"); //"UCS-2"); //"UCS2-LE"); x->stream = stream; read_header(stream, &x->header); check_header(&x->header); @@ -388,9 +387,7 @@ static void cat(struct entry *e, struct cfb_file *f) { } } -static size_t xconv(wchar *iname, char *oname, size_t length) { - /* size_t ileft = length, oleft; */ - /* return iconv(conv, (char **) &iname, &ileft, &oname, &oleft); */ +static size_t xconv(wchar *iname, char *oname, size_t length) { // TODO utf8 int i; for(i = 0; i < length / sizeof(wchar); i++) oname[i] = iname[i]; diff --git a/default.nix b/default.nix @@ -6,7 +6,7 @@ stdenv.mkDerivation rec { src = ./.; installPhase = '' mkdir -p $out/bin - cp cfb ppt $out/bin + cp cfb ppt unppt ppt2html $out/bin ''; meta = { license = stdenv.lib.licenses.gpl3Plus; diff --git a/ppt.c b/ppt.c @@ -1,4 +1,6 @@ // TODO proper little endian read/write +// TODO pic in place instead of appended +// TODO limit mem like timeout const char *VERSION = #include "VERSION" @@ -140,36 +142,45 @@ static void cat(struct in *in, FILE *out, dword n) { } } +static void utf8txt(uint16_t c) { + if(c <= 0) return; + if(c < 0x80) { // 0xxxxxxx + switch(c) { + case '\r': puts(""); break; + default: putchar(c); + } + } else if(c < 0x800) { // 110xxxxx 10xxxxxx + putchar(0xc0 | (c >> 6)); + putchar(0x80 | (0x3f & c)); + } else if(c < 0x10000) { // 1110xxxx 10xxxxxx 10xxxxxx + putchar(0xe0 | (c >> 12)); + putchar(0x80 | (0x3f & (c >> 6))); + putchar(0x80 | (0x3f & c)); + } +} + static 
void txt(struct RecordHeader *h, struct in *in) { switch(h->recType) { case 0x0fa0: // RT_TextCharsAtom utf16le case 0x0fba: // RT_CString for(int i = 0; i < h->recLen; i += 2) { - short c; + uint16_t c; if(1 != in_read(&c, 2, 1, in)) { fprintf(stderr, "unexpected end of file\n"); exit(1); } - if(0 < c && c < 0x80) { - switch(c) { - case 0x0d: puts(""); break; - default: putchar(c); // TODO whole utf - } - } + utf8txt(c); } puts(""); break; case 0x0fa8: // RT_TextBytesAtom ascii for(int i = 0; i < h->recLen; i++) { - char c; + uint8_t c; if(1 != in_read(&c, 1, 1, in)) { fprintf(stderr, "unexpected end of file\n"); exit(1); } - switch(c) { - case 0x0d: puts(""); break; - default: putchar(c); // TODO whole utf - } + utf8txt(c); } puts(""); break; @@ -183,38 +194,51 @@ static void txt(struct RecordHeader *h, struct in *in) { } } +static void utf8html(uint16_t c) { + if(c <= 0) return; + if(c < 0x80) { // 0xxxxxxx + switch(c) { + case '&': printf("&amp;"); break; + case '<': printf("&lt;"); break; + case '>': printf("&gt;"); break; + case '\'': printf("&quot;"); break; + case '\r': puts("<br>"); break; + default: putchar(c); + } + } else if(c < 0x800) { // 110xxxxx 10xxxxxx + putchar(0xc0 | (c >> 6)); + putchar(0x80 | (0x3f & c)); + } else if(c < 0x10000) { // 1110xxxx 10xxxxxx 10xxxxxx + putchar(0xe0 | (c >> 12)); + putchar(0x80 | (0x3f & (c >> 6))); + putchar(0x80 | (0x3f & c)); + } +} + static void html(struct RecordHeader *h, struct in *in) { switch(h->recType) { case 0x0fa0: // RT_TextCharsAtom utf16le case 0x0fba: // RT_CString printf("<p>"); for(int i = 0; i < h->recLen; i += 2) { - short c; + uint16_t c; if(1 != in_read(&c, 2, 1, in)) { fprintf(stderr, "unexpected end of file\n"); exit(1); } - if(0 < c && c < 0x80) { - switch(c) { - case 0x0d: puts("<br>"); break; - default: putchar(c); // TODO whole utf - } - } + utf8html(c); } puts("</p>"); break; case 0x0fa8: // RT_TextBytesAtom ascii printf("<p>"); for(int i = 0; i < h->recLen; i++) { - char c; + uint8_t c; 
if(1 != in_read(&c, 1, 1, in)) { fprintf(stderr, "unexpected end of file\n"); exit(1); } - switch(c) { - case 0x0d: puts("<br>"); break; - default: putchar(c); // TODO whole utf - } + utf8html(c); } puts("</p>"); break; diff --git a/ppt2html b/ppt2html @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail +d=$(mktemp -d -q) +( + cd $d + cfb cat "$1" '/Root Entry/PowerPoint Document' >.doc + ppt html .doc >index.html + rm .doc + cfb cat "$1" '/Root Entry/Pictures' >.pic + ppt extract .pic + rm .pic + ls * \ + | grep -v index.html \ + | sort -n \ + | xargs -n 1 -I{} echo '<img src="{}">' >>index.html + echo $d +) diff --git a/unppt b/unppt @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +set -euo pipefail +cfb cat "$1" '/Root Entry/PowerPoint Document' | ppt txt diff --git a/utf8.c b/utf8.c @@ -1,151 +0,0 @@ -/* - pptHtml - Format a PowerPoint Presentation into Html - Copyright 2002 Charles N Wyble <jackshck@yahoo.com> - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include "utf8.h" - -#include "stdio.h" - -void OutputCharCorrected(unsigned char c) -{ - switch (c) - { /* Special char handlers here... 
*/ - case '\r': - printf("<BR>\n"); - break; - case 0x3C: - printf("&lt;"); - break; - case 0x3E: - printf("&gt;"); - break; - case 0x26: - printf("&amp;"); - break; - case 0x22: - printf("&quot;"); - break; - /* Also need to cover 128-159 since MS uses this area... */ - case 0x80: /* Euro Symbol */ - printf("&#8364;"); - break; - case 0x82: /* baseline single quote */ - printf("&#8218;"); - break; - case 0x83: /* florin */ - printf("&#402;"); - break; - case 0x84: /* baseline double quote */ - printf("&#8222;"); - break; - case 0x85: /* ellipsis */ - printf("&#8230;"); - break; - case 0x86: /* dagger */ - printf("&#8224;"); - break; - case 0x87: /* double dagger */ - printf("&#8225;"); - break; - case 0x88: /* circumflex accent */ - printf("&#710;"); - break; - case 0x89: /* permile */ - printf("&#8240;"); - break; - case 0x8A: /* S Hacek */ - printf("&#352;"); - break; - case 0x8B: /* left single guillemet */ - printf("&#8249;"); - break; - case 0x8C: /* OE ligature */ - printf("&#338;"); - break; - case 0x8E: /* #LATIN CAPITAL LETTER Z WITH CARON */ - printf("&#381;"); - break; - case 0x91: /* left single quote ? */ - printf("&#8216;"); - break; - case 0x92: /* right single quote ? 
*/ - printf("&#8217;"); - break; - case 0x93: /* left double quote */ - printf("&#8220;"); - break; - case 0x94: /* right double quote */ - printf("&#8221;"); - break; - case 0x95: /* bullet */ - printf("&#8226;"); - break; - case 0x96: /* endash */ - printf("&#8211;"); - break; - case 0x97: /* emdash */ - printf("&#8212;"); - break; - case 0x98: /* tilde accent */ - printf("&#732;"); - break; - case 0x99: /* trademark ligature */ - printf("&#8482;"); - break; - case 0x9A: /* s Haceks Hacek */ - printf("&#353;"); - break; - case 0x9B: /* right single guillemet */ - printf("&#8250;"); - break; - case 0x9C: /* oe ligature */ - printf("&#339;"); - break; - case 0x9F: /* Y Dieresis */ - printf("&#376;"); - break; - default: - putchar(c); - break; - } -} - -void print_utf8(unsigned short c) -{ - if (c == 0) - return; - - if (c < 0x80) - OutputCharCorrected(c); - else if (c < 0x800) - { - putchar(0xC0 | (c >> 6)); - put_utf8(c); - } - else - { - putchar(0xE0 | (c >> 12)); - put_utf8(c >> 6); - put_utf8(c); - } -} - -void put_utf8(unsigned short c) -{ - putchar(0x0080 | ((short)c & 0x003F)); -}