commit 9a3b0a6df06fd40591986a6b1bcd5acf093bfb06
parent a9018212f9f623c9e1cf5b43b85b792894e7b193
Author: Tomas Hlavaty <tom@logand.com>
Date: Sun, 14 Jul 2019 23:35:30 +0200
utf8, unppt and ppt2html
Diffstat:
M  cfb.c       |   7 ++-----
M  default.nix |   2 +-
M  ppt.c       |  72 ++++++++++++++++++++++++++++++++++++++++++++++++------------------------
A  ppt2html    |  17 +++++++++++++++++
A  unppt       |   3 +++
D  utf8.c      | 151 ------------------------------------------------------------------------------
6 files changed, 71 insertions(+), 181 deletions(-)
diff --git a/cfb.c b/cfb.c
@@ -1,4 +1,5 @@
// TODO version 4 with bigger sector size
+// TODO from pipe without seek
const char *VERSION =
#include "VERSION"
@@ -12,7 +13,6 @@ const char *VERSION =
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
-//#include <iconv.h> // TODO wchar -> utf8 properly
// MS-CFB Compound File Binary File Format
@@ -294,7 +294,6 @@ static void open_cfb_file(char *filename, struct cfb_file *x) {
fprintf(stderr, "Unable to open '%s'.\n", filename);
exit(1);
}
- //conv = iconv_open("UTF-8", "UTF-16LE"); //"UCS-2"); //"UCS2-LE");
x->stream = stream;
read_header(stream, &x->header);
check_header(&x->header);
@@ -388,9 +387,7 @@ static void cat(struct entry *e, struct cfb_file *f) {
}
}
-static size_t xconv(wchar *iname, char *oname, size_t length) {
- /* size_t ileft = length, oleft; */
- /* return iconv(conv, (char **) &iname, &ileft, &oname, &oleft); */
+static size_t xconv(wchar *iname, char *oname, size_t length) { // TODO utf8
int i;
for(i = 0; i < length / sizeof(wchar); i++)
oname[i] = iname[i];
diff --git a/default.nix b/default.nix
@@ -6,7 +6,7 @@ stdenv.mkDerivation rec {
src = ./.;
installPhase = ''
mkdir -p $out/bin
- cp cfb ppt $out/bin
+ cp cfb ppt unppt ppt2html $out/bin
'';
meta = {
license = stdenv.lib.licenses.gpl3Plus;
diff --git a/ppt.c b/ppt.c
@@ -1,4 +1,6 @@
// TODO proper little endian read/write
+// TODO pic in place instead of appended
+// TODO limit mem like timeout
const char *VERSION =
#include "VERSION"
@@ -140,36 +142,45 @@ static void cat(struct in *in, FILE *out, dword n) {
}
}
+// Write one UTF-16 code unit to stdout as UTF-8 plain text.
+// NUL is skipped and '\r' is printed as a newline.  A surrogate pair is
+// combined into a single 4-byte sequence: the high half is remembered
+// between calls until its low half arrives.  Unpaired surrogates are
+// dropped, because UTF-8 has no valid encoding for them.
+static void utf8txt(uint16_t c) {
+  static uint16_t hi; // pending high surrogate, 0 when there is none
+  uint32_t u = c;     // code point to print
+  if(0xd800 <= c && c < 0xdc00) { // high surrogate: wait for the low half
+    hi = c;
+    return;
+  }
+  if(0xdc00 <= c && c < 0xe000) { // low surrogate
+    if(!hi) return; // nothing to pair it with
+    u = 0x10000 + ((uint32_t) (hi - 0xd800) << 10) + (c - 0xdc00);
+  }
+  hi = 0; // a pending high half is either consumed or abandoned here
+  if(u == 0) return;
+  if(u < 0x80) { // 0xxxxxxx
+    switch(u) {
+    case '\r': puts(""); break;
+    default: putchar(u);
+    }
+  } else if(u < 0x800) { // 110xxxxx 10xxxxxx
+    putchar(0xc0 | (u >> 6));
+    putchar(0x80 | (0x3f & u));
+  } else if(u < 0x10000) { // 1110xxxx 10xxxxxx 10xxxxxx
+    putchar(0xe0 | (u >> 12));
+    putchar(0x80 | (0x3f & (u >> 6)));
+    putchar(0x80 | (0x3f & u));
+  } else { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    putchar(0xf0 | (u >> 18));
+    putchar(0x80 | (0x3f & (u >> 12)));
+    putchar(0x80 | (0x3f & (u >> 6)));
+    putchar(0x80 | (0x3f & u));
+  }
+}
+
static void txt(struct RecordHeader *h, struct in *in) {
switch(h->recType) {
case 0x0fa0: // RT_TextCharsAtom utf16le
case 0x0fba: // RT_CString
for(int i = 0; i < h->recLen; i += 2) {
- short c;
+ uint16_t c;
if(1 != in_read(&c, 2, 1, in)) {
fprintf(stderr, "unexpected end of file\n");
exit(1);
}
- if(0 < c && c < 0x80) {
- switch(c) {
- case 0x0d: puts(""); break;
- default: putchar(c); // TODO whole utf
- }
- }
+ utf8txt(c);
}
puts("");
break;
case 0x0fa8: // RT_TextBytesAtom ascii
for(int i = 0; i < h->recLen; i++) {
- char c;
+ uint8_t c;
if(1 != in_read(&c, 1, 1, in)) {
fprintf(stderr, "unexpected end of file\n");
exit(1);
}
- switch(c) {
- case 0x0d: puts(""); break;
- default: putchar(c); // TODO whole utf
- }
+ utf8txt(c);
}
puts("");
break;
@@ -183,38 +194,51 @@ static void txt(struct RecordHeader *h, struct in *in) {
}
}
+// Write one UTF-16 code unit to stdout as UTF-8 for use in HTML text.
+// NUL is skipped; & < > " are printed as character entities and '\r' as
+// <br>.  A surrogate pair is combined into a single 4-byte sequence: the
+// high half is remembered between calls until its low half arrives.
+// Unpaired surrogates are dropped, because UTF-8 cannot encode them.
+static void utf8html(uint16_t c) {
+  static uint16_t hi; // pending high surrogate, 0 when there is none
+  uint32_t u = c;     // code point to print
+  if(0xd800 <= c && c < 0xdc00) { // high surrogate: wait for the low half
+    hi = c;
+    return;
+  }
+  if(0xdc00 <= c && c < 0xe000) { // low surrogate
+    if(!hi) return; // nothing to pair it with
+    u = 0x10000 + ((uint32_t) (hi - 0xd800) << 10) + (c - 0xdc00);
+  }
+  hi = 0; // a pending high half is either consumed or abandoned here
+  if(u == 0) return;
+  if(u < 0x80) { // 0xxxxxxx
+    switch(u) {
+    case '&': printf("&amp;"); break;
+    case '<': printf("&lt;"); break;
+    case '>': printf("&gt;"); break;
+    case '"': printf("&quot;"); break; // the double quote, not the apostrophe
+    case '\r': puts("<br>"); break;
+    default: putchar(u);
+    }
+  } else if(u < 0x800) { // 110xxxxx 10xxxxxx
+    putchar(0xc0 | (u >> 6));
+    putchar(0x80 | (0x3f & u));
+  } else if(u < 0x10000) { // 1110xxxx 10xxxxxx 10xxxxxx
+    putchar(0xe0 | (u >> 12));
+    putchar(0x80 | (0x3f & (u >> 6)));
+    putchar(0x80 | (0x3f & u));
+  } else { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    putchar(0xf0 | (u >> 18));
+    putchar(0x80 | (0x3f & (u >> 12)));
+    putchar(0x80 | (0x3f & (u >> 6)));
+    putchar(0x80 | (0x3f & u));
+  }
+}
+
static void html(struct RecordHeader *h, struct in *in) {
switch(h->recType) {
case 0x0fa0: // RT_TextCharsAtom utf16le
case 0x0fba: // RT_CString
printf("<p>");
for(int i = 0; i < h->recLen; i += 2) {
- short c;
+ uint16_t c;
if(1 != in_read(&c, 2, 1, in)) {
fprintf(stderr, "unexpected end of file\n");
exit(1);
}
- if(0 < c && c < 0x80) {
- switch(c) {
- case 0x0d: puts("<br>"); break;
- default: putchar(c); // TODO whole utf
- }
- }
+ utf8html(c);
}
puts("</p>");
break;
case 0x0fa8: // RT_TextBytesAtom ascii
printf("<p>");
for(int i = 0; i < h->recLen; i++) {
- char c;
+ uint8_t c;
if(1 != in_read(&c, 1, 1, in)) {
fprintf(stderr, "unexpected end of file\n");
exit(1);
}
- switch(c) {
- case 0x0d: puts("<br>"); break;
- default: putchar(c); // TODO whole utf
- }
+ utf8html(c);
}
puts("</p>");
break;
diff --git a/ppt2html b/ppt2html
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Usage: ppt2html FILE.ppt
+# Builds index.html plus the files written by "ppt extract" in a fresh
+# temporary directory and prints that directory's name.
+set -euo pipefail
+# Make the input path absolute: everything below runs inside the temp dir,
+# where a relative path would no longer resolve.
+case $1 in
+  /*) f=$1 ;;
+  *) f=$PWD/$1 ;;
+esac
+d=$(mktemp -d -q)
+(
+  cd "$d"
+  cfb cat "$f" '/Root Entry/PowerPoint Document' >.doc
+  ppt html .doc >index.html
+  rm .doc
+  cfb cat "$f" '/Root Entry/Pictures' >.pic
+  ppt extract .pic
+  rm .pic
+  # Append one <img> per remaining file, in numeric order.  grep exits 1
+  # when nothing but index.html is left; "|| true" keeps pipefail from
+  # aborting the script in that case.
+  ls \
+    | { grep -vxF index.html || true; } \
+    | sort -n \
+    | while IFS= read -r p; do printf '<img src="%s">\n' "$p"; done >>index.html
+  echo "$d"
+)
diff --git a/unppt b/unppt
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# Usage: unppt FILE.ppt
+# Pipes the file's PowerPoint Document stream through "ppt txt".
+set -o errexit -o nounset -o pipefail
+stream='/Root Entry/PowerPoint Document'
+cfb cat "$1" "$stream" | ppt txt
diff --git a/utf8.c b/utf8.c
@@ -1,151 +0,0 @@
-/*
- pptHtml - Format a PowerPoint Presentation into Html
- Copyright 2002 Charles N Wyble <jackshck@yahoo.com>
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include "utf8.h"
-
-#include "stdio.h"
-
-void OutputCharCorrected(unsigned char c)
-{
- switch (c)
- { /* Special char handlers here... */
- case '\r':
- printf("<BR>\n");
- break;
- case 0x3C:
- printf("&lt;");
- break;
- case 0x3E:
- printf("&gt;");
- break;
- case 0x26:
- printf("&amp;");
- break;
- case 0x22:
- printf("&quot;");
- break;
- /* Also need to cover 128-159 since MS uses this area... */
- case 0x80: /* Euro Symbol */
- printf("€");
- break;
- case 0x82: /* baseline single quote */
- printf("‚");
- break;
- case 0x83: /* florin */
- printf("ƒ");
- break;
- case 0x84: /* baseline double quote */
- printf("„");
- break;
- case 0x85: /* ellipsis */
- printf("…");
- break;
- case 0x86: /* dagger */
- printf("†");
- break;
- case 0x87: /* double dagger */
- printf("‡");
- break;
- case 0x88: /* circumflex accent */
- printf("ˆ");
- break;
- case 0x89: /* permile */
- printf("‰");
- break;
- case 0x8A: /* S Hacek */
- printf("Š");
- break;
- case 0x8B: /* left single guillemet */
- printf("‹");
- break;
- case 0x8C: /* OE ligature */
- printf("Œ");
- break;
- case 0x8E: /* #LATIN CAPITAL LETTER Z WITH CARON */
- printf("Ž");
- break;
- case 0x91: /* left single quote ? */
- printf("‘");
- break;
- case 0x92: /* right single quote ? */
- printf("’");
- break;
- case 0x93: /* left double quote */
- printf("“");
- break;
- case 0x94: /* right double quote */
- printf("”");
- break;
- case 0x95: /* bullet */
- printf("•");
- break;
- case 0x96: /* endash */
- printf("–");
- break;
- case 0x97: /* emdash */
- printf("—");
- break;
- case 0x98: /* tilde accent */
- printf("˜");
- break;
- case 0x99: /* trademark ligature */
- printf("™");
- break;
- case 0x9A: /* s Haceks Hacek */
- printf("š");
- break;
- case 0x9B: /* right single guillemet */
- printf("›");
- break;
- case 0x9C: /* oe ligature */
- printf("œ");
- break;
- case 0x9F: /* Y Dieresis */
- printf("Ÿ");
- break;
- default:
- putchar(c);
- break;
- }
-}
-
-void print_utf8(unsigned short c)
-{
- if (c == 0)
- return;
-
- if (c < 0x80)
- OutputCharCorrected(c);
- else if (c < 0x800)
- {
- putchar(0xC0 | (c >> 6));
- put_utf8(c);
- }
- else
- {
- putchar(0xE0 | (c >> 12));
- put_utf8(c >> 6);
- put_utf8(c);
- }
-}
-
-void put_utf8(unsigned short c)
-{
- putchar(0x0080 | ((short)c & 0x003F));
-}