utf8, unppt and ppt2html - olefs - command line tools to extract data from OLE documents like doc, ppt, xls, msg

commit 9a3b0a6df06fd40591986a6b1bcd5acf093bfb06
parent a9018212f9f623c9e1cf5b43b85b792894e7b193
Author: Tomas Hlavaty <tom@logand.com>
Date:   Sun, 14 Jul 2019 23:35:30 +0200

utf8, unppt and ppt2html

Diffstat:
M cfb.c  | 7 ++-----
M default.nix  | 2 +-
M ppt.c  | 72 ++++++++++++++++++++++++++++++++++++++++++++++++------------------------
A ppt2html  | 17 +++++++++++++++++
A unppt  | 3 +++
D utf8.c  | 151 ------------------------------------------------------------------------------

6 files changed, 71 insertions(+), 181 deletions(-)
diff --git a/cfb.c b/cfb.c
@@ -1,4 +1,5 @@
 // TODO version 4 with bigger sector size
+// TODO from pipe without seek
 
 const char *VERSION =
 #include "VERSION"
@@ -12,7 +13,6 @@ const char *VERSION =
 #include <fcntl.h>
 #include <unistd.h>
 #include <stdint.h>
-//#include <iconv.h> // TODO wchar -> utf8 properly
 
 // MS-CFB Compound File Binary File Format
 
@@ -294,7 +294,6 @@ static void open_cfb_file(char *filename, struct cfb_file *x) {
     fprintf(stderr, "Unable to open '%s'.\n", filename);
     exit(1);
   }
-  //conv = iconv_open("UTF-8", "UTF-16LE"); //"UCS-2"); //"UCS2-LE");
   x->stream = stream;
   read_header(stream, &x->header);
   check_header(&x->header);
@@ -388,9 +387,7 @@ static void cat(struct entry *e, struct cfb_file *f) {
   }
 }
 
-static size_t xconv(wchar *iname, char *oname, size_t length) {
-  /* size_t ileft = length, oleft; */
-  /* return iconv(conv, (char **) &iname, &ileft, &oname, &oleft); */
+static size_t xconv(wchar *iname, char *oname, size_t length) { // TODO utf8
   int i;
   for(i = 0; i < length / sizeof(wchar); i++)
     oname[i] = iname[i];
diff --git a/default.nix b/default.nix
@@ -6,7 +6,7 @@ stdenv.mkDerivation rec {
   src = ./.;
   installPhase = ''
     mkdir -p $out/bin
-    cp cfb ppt $out/bin
+    cp cfb ppt unppt ppt2html $out/bin
   '';
   meta = {
     license = stdenv.lib.licenses.gpl3Plus;
diff --git a/ppt.c b/ppt.c
@@ -1,4 +1,6 @@
 // TODO proper little endian read/write
+// TODO pic in place instead of appended
+// TODO limit mem like timeout
 
 const char *VERSION =
 #include "VERSION"
@@ -140,36 +142,45 @@ static void cat(struct in *in, FILE *out, dword n) {
   }
 }
 
+static void utf8txt(uint16_t c) { // print one UTF-16 code unit to stdout as UTF-8 plain text
+  if(c <= 0) return; // c is unsigned, so this only drops NUL code units
+  if(c < 0x80) { // 0xxxxxxx
+    switch(c) {
+    case '\r': puts(""); break; // carriage return is printed as a newline
+    default: putchar(c);
+    }
+  } else if(c < 0x800) { // 110xxxxx  10xxxxxx
+    putchar(0xc0 | (c >> 6));
+    putchar(0x80 | (0x3f & c));
+  } else if(c < 0x10000) { // 1110xxxx  10xxxxxx  10xxxxxx; always true for uint16_t
+    putchar(0xe0 | (c >> 12)); // NOTE(review): surrogate halves (0xd800-0xdfff) also land here and are encoded one by one
+    putchar(0x80 | (0x3f & (c >> 6)));
+    putchar(0x80 | (0x3f & c));
+  }
+}
+
 static void txt(struct RecordHeader *h, struct in *in) {
   switch(h->recType) {
   case 0x0fa0: // RT_TextCharsAtom utf16le
   case 0x0fba: // RT_CString
     for(int i = 0; i < h->recLen; i += 2) {
-      short c;
+      uint16_t c;
       if(1 != in_read(&c, 2, 1, in)) {
         fprintf(stderr, "unexpected end of file\n");
         exit(1);
       }
-      if(0 < c && c < 0x80) {
-        switch(c) {
-        case 0x0d: puts(""); break;
-        default: putchar(c); // TODO whole utf
-        }
-      }
+      utf8txt(c);
     }
     puts("");
     break;
   case 0x0fa8: // RT_TextBytesAtom ascii
     for(int i = 0; i < h->recLen; i++) {
-      char c;
+      uint8_t c;
       if(1 != in_read(&c, 1, 1, in)) {
         fprintf(stderr, "unexpected end of file\n");
         exit(1);
       }
-      switch(c) {
-      case 0x0d: puts(""); break;
-      default: putchar(c); // TODO whole utf
-      }
+      utf8txt(c);
     }
     puts("");
     break;
@@ -183,38 +194,51 @@ static void txt(struct RecordHeader *h, struct in *in) {
   }
 }
 
+static void utf8html(uint16_t c) { // print one UTF-16 code unit to stdout as HTML-escaped UTF-8
+  if(c <= 0) return; // c is unsigned, so this only drops NUL code units
+  if(c < 0x80) { // 0xxxxxxx
+    switch(c) {
+    case '&': printf("&amp;"); break;
+    case '<': printf("&lt;"); break;
+    case '>': printf("&gt;"); break;
+    case '"': printf("&quot;"); break; // &quot; is U+0022; an apostrophe needs no escaping in text content
+    case '\r': puts("<br>"); break; // carriage return becomes a line break tag plus newline
+    default: putchar(c);
+    }
+  } else if(c < 0x800) { // 110xxxxx  10xxxxxx
+    putchar(0xc0 | (c >> 6));
+    putchar(0x80 | (0x3f & c));
+  } else if(c < 0x10000) { // 1110xxxx  10xxxxxx  10xxxxxx; always true for uint16_t, surrogate halves are encoded individually
+    putchar(0xe0 | (c >> 12));
+    putchar(0x80 | (0x3f & (c >> 6)));
+    putchar(0x80 | (0x3f & c));
+  }
+}
+
 static void html(struct RecordHeader *h, struct in *in) {
   switch(h->recType) {
   case 0x0fa0: // RT_TextCharsAtom utf16le
   case 0x0fba: // RT_CString
     printf("<p>");
     for(int i = 0; i < h->recLen; i += 2) {
-      short c;
+      uint16_t c;
       if(1 != in_read(&c, 2, 1, in)) {
         fprintf(stderr, "unexpected end of file\n");
         exit(1);
       }
-      if(0 < c && c < 0x80) {
-        switch(c) {
-        case 0x0d: puts("<br>"); break;
-        default: putchar(c); // TODO whole utf
-        }
-      }
+      utf8html(c);
     }
     puts("</p>");
     break;
   case 0x0fa8: // RT_TextBytesAtom ascii
     printf("<p>");
     for(int i = 0; i < h->recLen; i++) {
-      char c;
+      uint8_t c;
       if(1 != in_read(&c, 1, 1, in)) {
         fprintf(stderr, "unexpected end of file\n");
         exit(1);
       }
-      switch(c) {
-      case 0x0d: puts("<br>"); break;
-      default: putchar(c); // TODO whole utf
-      }
+      utf8html(c);
     }
     puts("</p>");
     break;
diff --git a/ppt2html b/ppt2html
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -euo pipefail # usage: ppt2html FILE.ppt; prints the temp directory containing index.html and the files written by ppt extract
+d=$(mktemp -d -q)
+(
+    cfb cat "$1" '/Root Entry/PowerPoint Document' >"$d/.doc" # read the input before cd so a relative path still resolves
+    cfb cat "$1" '/Root Entry/Pictures' >"$d/.pic"
+    cd "$d"
+    ppt html .doc >index.html
+    rm .doc
+    ppt extract .pic
+    rm .pic
+    ls * \
+        | { grep -v index.html || true; } \
+        | sort -n \
+        | xargs -I{} echo '<img src="{}">' >>index.html # -I already means one argument per run; "|| true" keeps an empty file list from failing the pipeline
+    echo "$d"
+)
diff --git a/unppt b/unppt
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+set -euo pipefail # usage: unppt FILE.ppt
+cfb cat "$1" '/Root Entry/PowerPoint Document' | ppt txt # pipe the PowerPoint Document stream into ppt's txt mode
diff --git a/utf8.c b/utf8.c
@@ -1,151 +0,0 @@
-/*
-   pptHtml - Format a PowerPoint Presentation into Html
-   Copyright 2002 Charles N Wyble <jackshck@yahoo.com>	
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published  by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
- */
-
-#include "utf8.h"
-
-#include "stdio.h"
-
-void OutputCharCorrected(unsigned char c)
-{
-	switch (c)
-	{	/* Special char handlers here... */
-		case '\r':
-			printf("<BR>\n");
-			break;
-		case 0x3C:
-			printf("&lt;");
-			break;
-		case 0x3E:
-			printf("&gt;");
-			break;
-		case 0x26:
-			printf("&amp;");
-			break;
-		case 0x22:
-			printf("&quot;");
-			break;
-		/* Also need to cover 128-159 since MS uses this area... */
-		case 0x80:		/* Euro Symbol */
-			printf("&#8364;");
-			break;
-		case 0x82:		/* baseline single quote */
-			printf("&#8218;");
-			break;
-		case 0x83:		/* florin */
-			printf("&#402;");
-			break;
-		case 0x84:		/* baseline double quote */
-			printf("&#8222;");
-			break;
-		case 0x85:		/* ellipsis */
-			printf("&#8230;");
-			break;
-		case 0x86:		/* dagger */
-		    printf("&#8224;");
-		    break;
-		case 0x87:		/* double dagger */
-		    printf("&#8225;");
-		    break;
-		case 0x88:		/* circumflex accent */
-		    printf("&#710;");
-		    break;
-		case 0x89:		/* permile */
-		    printf("&#8240;");
-		    break;
-		case 0x8A:		/* S Hacek */
-		    printf("&#352;");
-		    break;
-		case 0x8B:		/* left single guillemet */
-		    printf("&#8249;");
-		    break;
-		case 0x8C:		/* OE ligature */
-		    printf("&#338;");
-		    break;
-		case 0x8E:		/*  #LATIN CAPITAL LETTER Z WITH CARON */
-			printf("&#381;");
-			break;
-		case 0x91:		/* left single quote ? */
-		    printf("&#8216;");
-		    break;
-		case 0x92:		/* right single quote ? */
-		    printf("&#8217;");
-		    break;
-		case 0x93:		/* left double quote */
-		    printf("&#8220;");
-		    break;
-		case 0x94:		/* right double quote */
-		    printf("&#8221;");
-		    break;
-		case 0x95:		/* bullet */
-		    printf("&#8226;");
-		    break;
-		case 0x96:		/* endash */
-		    printf("&#8211;");
-		    break;
-		case 0x97:		/* emdash */
-		    printf("&#8212;");
-		    break;
-		case 0x98:		/* tilde accent */
-		    printf("&#732;");
-		    break;
-		case 0x99:		/* trademark ligature */
-		    printf("&#8482;");
-		    break;
-		case 0x9A:		/* s Haceks Hacek */
-		    printf("&#353;");
-		    break;
-		case 0x9B:		/* right single guillemet */
-		    printf("&#8250;");
-		    break;
-		case 0x9C:		/* oe ligature */
-		    printf("&#339;");
-		    break;
-		case 0x9F:		/* Y Dieresis */
-		    printf("&#376;");
-		    break;
-		default:
-			putchar(c);
-			break;
-	}
-}
-
-void print_utf8(unsigned short c)
-{
-	if (c == 0)
-		return;
-		
-	if (c < 0x80)
-		OutputCharCorrected(c);
-	else if (c < 0x800)
-	{
-		putchar(0xC0 | (c >>  6));
-		put_utf8(c);
-	}
-	else
-	{
-		putchar(0xE0 | (c >> 12));
-		put_utf8(c >>  6);
-		put_utf8(c);
-	}
-}
-
-void put_utf8(unsigned short c)
-{
-	putchar(0x0080 | ((short)c & 0x003F));
-}

	olefs command line tools to extract data from OLE documents like doc, ppt, xls, msg
	git clone https://logand.com/git/olefs.git/
	Log \| Files \| Refs

M	cfb.c	\|	7	++-----
M	default.nix	\|	2	+-
M	ppt.c	\|	72	++++++++++++++++++++++++++++++++++++++++++++++++------------------------
A	ppt2html	\|	17	+++++++++++++++++
A	unppt	\|	3	+++
D	utf8.c	\|	151	------------------------------------------------------------------------------