olefs

command line tools to extract data from OLE documents like doc, ppt, xls, msg
git clone https://logand.com/git/olefs.git/
Log | Files | Refs

commit eb117a980e2fa75c0495dfcf193aaee474a6f48f
parent 8925af9af2a2f65141fa80022bda3ffd1136f8a0
Author: Tomas Hlavaty <tom@logand.com>
Date:   Mon, 30 May 2011 20:43:47 +0200

rhdump and ppt2html added

Diffstat:
M Makefile | 13 ++++++++++++-
A ppt2html.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A rhdump.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A utf8.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A utf8.h | 3 +++
5 files changed, 335 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-ALL=cfbfs odrawfs
+ALL=cfbfs odrawfs rhdump ppt2html
 #-std=c99
 CFLAGS=-g -Wall
 #CFLAGS=-Wall -O2
@@ -16,5 +16,16 @@ odrawfs: odrawfs.c
 	$(CC) $(CFLAGS) $(CFLAGSFUSE) -o $@ $< $(LDFLAGS) $(LDFLAGSFUSE)
 #	strip $@
 
+rhdump: rhdump.c
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+#	strip $@
+
+utf8.o: utf8.c utf8.h
+	$(CC) $(CFLAGS) -o $@ -c $<
+
+ppt2html: ppt2html.c utf8.o
+	$(CC) $(CFLAGS) -o $@ $< utf8.o $(LDFLAGS)
+#	strip $@
+
 clean:
 	rm -f $(ALL)
diff --git a/ppt2html.c b/ppt2html.c
@@ -0,0 +1,105 @@
+// TODO proper little endian read_
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "utf8.h"
+
+typedef uint32_t dword;
+
+// MS-PPT PowerPoint (.ppt) Binary File Format
+
+struct RecordHeader {
+  ushort recVer: 4; //(logand #x0f %dummy1))
+  ushort recInstance: 12; //(logior (ash %dummy2 4) (ash %dummy1 -4)))
+  ushort recType;
+  dword recLen;
+} __attribute__((__packed__));
+
+static size_t read_RecordHeader(FILE *stream, struct RecordHeader *x) {
+  return fread(x, sizeof(struct RecordHeader), 1, stream);
+}
+
+static int slide_no = 0;
+
+static void out(FILE *stream, int level, int i, struct RecordHeader *h) {
+  int j;
+  /* for(j = 0; j < level; j++) */
+  /* printf(" "); */
+  /* printf("%d 0x%x 0x%x 0x%x %u\n", i, h->recVer, h->recInstance, h->recType, h->recLen); */
+  switch(h->recType) {
+  case 0x0fa0: // RT_TextCharsAtom utf16le
+  case 0x0fba: // RT_CString
+    printf("<p>");
+    for(j = 0; j < h->recLen; j += 2) {
+      short c;
+      fread(&c, 2, 1, stream);
+      print_utf8(c);
+    }
+    puts("</p>");
+    break;
+  case 0x0fa8: // RT_TextBytesAtom ascii
+    printf("<p>");
+    for(j = 0; j < h->recLen; j++) {
+      char c;
+      fread(&c, 1, 1, stream);
+      if(c == 0x0d)
+        printf("<br/>\n");
+      else
+        putchar(c);
+    }
+    puts("</p>");
+    break;
+  case 0x03ee: // RT_Slide
+  case 0x03e8: // RT_Document
+    if(0 < slide_no)
+      puts("<hr/>\n</div>");
+    printf("<div class=\"slide\">\n<h1>Slide %d</h1>\n", ++slide_no);
+  }
+}
+
+static void dump(FILE *stream, int level, dword pos) {
+  int i;
+  for(i = 0;; i++) {
+    if(0 < pos && pos <= ftell(stream))
+      break;
+    struct RecordHeader h;
+    if(read_RecordHeader(stream, &h) <= 0) {
+      break; // EOF
+    }
+    size_t start = ftell(stream);
+    out(stream, level, i, &h);
+    if(0xf == h.recVer) {
+      dword n = ftell(stream) + h.recLen;
+      if(0 < pos)
+        n = n < pos ? n : pos;
+      dump(stream, 1 + level, n);
+    } else
+      fseek(stream, start + h.recLen, SEEK_SET);
+  }
+}
+
+int main(int argc, char *argv[]) {
+  if(argc < 2) {
+    fprintf(stderr, "Usage: %s filename\n", argv[0]);
+    exit(-1);
+  }
+  char *filename = argv[1];
+  FILE *stream = fopen(filename, "r");
+  if(!stream) {
+    fprintf(stderr, "Unable to open '%s'.\n", filename);
+    exit(-1);
+  }
+  puts("<html>\n<head>");
+  printf("<title>%s</title>\n", filename);
+  puts("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>");
+  puts("</head>\n<body>");
+  dump(stream, 0, 0);
+  if(0 < slide_no)
+    puts("</div>");
+  puts("</body>\n</html>");
+  fclose(stream);
+  return 0;
+}
diff --git a/rhdump.c b/rhdump.c
@@ -0,0 +1,64 @@
+// TODO proper little endian read_
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+typedef uint32_t dword;
+
+// MS-PPT PowerPoint (.ppt) Binary File Format
+
+struct RecordHeader {
+  ushort recVer: 4; //(logand #x0f %dummy1))
+  ushort recInstance: 12; //(logior (ash %dummy2 4) (ash %dummy1 -4)))
+  ushort recType;
+  dword recLen;
+} __attribute__((__packed__));
+
+static size_t read_RecordHeader(FILE *stream, struct RecordHeader *x) {
+  return fread(x, sizeof(struct RecordHeader), 1, stream);
+}
+
+static void out(int level, int i, struct RecordHeader *h) {
+  int j;
+  for(j = 0; j < level; j++)
+    printf(" ");
+  printf("%d 0x%x 0x%x 0x%x %u\n", i, h->recVer, h->recInstance, h->recType, h->recLen);
+}
+
+static void dump(FILE *stream, int level, dword pos) {
+  int i;
+  for(i = 0;; i++) {
+    if(0 < pos && pos <= ftell(stream))
+      break;
+    struct RecordHeader h;
+    if(read_RecordHeader(stream, &h) <= 0) {
+      break; // EOF
+    }
+    out(level, i, &h);
+    if(0xf == h.recVer) {
+      dword n = ftell(stream) + h.recLen;
+      if(0 < pos)
+        n = n < pos ? n : pos;
+      dump(stream, 1 + level, n);
+    } else
+      fseek(stream, h.recLen, SEEK_CUR);
+  }
+}
+
+int main(int argc, char *argv[]) {
+  if(argc < 2) {
+    fprintf(stderr, "Usage: %s filename\n", argv[0]);
+    exit(-1);
+  }
+  char *filename = argv[1];
+  FILE *stream = fopen(filename, "r");
+  if(!stream) {
+    fprintf(stderr, "Unable to open '%s'.\n", filename);
+    exit(-1);
+  }
+  dump(stream, 0, 0);
+  fclose(stream);
+  return 0;
+}
diff --git a/utf8.c b/utf8.c
@@ -0,0 +1,151 @@
+/*
+   pptHtml - Format a PowerPoint Presentation into Html
+   Copyright 2002 Charles N Wyble <jackshck@yahoo.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "utf8.h"
+
+#include "stdio.h"
+
+void OutputCharCorrected(unsigned char c)
+{
+    switch (c)
+    { /* Special char handlers here... */
+    case '\r':
+        printf("<BR>\n");
+        break;
+    case 0x3C:
+        printf("&lt;");
+        break;
+    case 0x3E:
+        printf("&gt;");
+        break;
+    case 0x26:
+        printf("&amp;");
+        break;
+    case 0x22:
+        printf("&quot;");
+        break;
+    /* Also need to cover 128-159 since MS uses this area... */
+    case 0x80: /* Euro Symbol */
+        printf("&#8364;");
+        break;
+    case 0x82: /* baseline single quote */
+        printf("&#8218;");
+        break;
+    case 0x83: /* florin */
+        printf("&#402;");
+        break;
+    case 0x84: /* baseline double quote */
+        printf("&#8222;");
+        break;
+    case 0x85: /* ellipsis */
+        printf("&#8230;");
+        break;
+    case 0x86: /* dagger */
+        printf("&#8224;");
+        break;
+    case 0x87: /* double dagger */
+        printf("&#8225;");
+        break;
+    case 0x88: /* circumflex accent */
+        printf("&#710;");
+        break;
+    case 0x89: /* permile */
+        printf("&#8240;");
+        break;
+    case 0x8A: /* S Hacek */
+        printf("&#352;");
+        break;
+    case 0x8B: /* left single guillemet */
+        printf("&#8249;");
+        break;
+    case 0x8C: /* OE ligature */
+        printf("&#338;");
+        break;
+    case 0x8E: /* #LATIN CAPITAL LETTER Z WITH CARON */
+        printf("&#381;");
+        break;
+    case 0x91: /* left single quote ? */
+        printf("&#8216;");
+        break;
+    case 0x92: /* right single quote ? */
+        printf("&#8217;");
+        break;
+    case 0x93: /* left double quote */
+        printf("&#8220;");
+        break;
+    case 0x94: /* right double quote */
+        printf("&#8221;");
+        break;
+    case 0x95: /* bullet */
+        printf("&#8226;");
+        break;
+    case 0x96: /* endash */
+        printf("&#8211;");
+        break;
+    case 0x97: /* emdash */
+        printf("&#8212;");
+        break;
+    case 0x98: /* tilde accent */
+        printf("&#732;");
+        break;
+    case 0x99: /* trademark ligature */
+        printf("&#8482;");
+        break;
+    case 0x9A: /* s Haceks Hacek */
+        printf("&#353;");
+        break;
+    case 0x9B: /* right single guillemet */
+        printf("&#8250;");
+        break;
+    case 0x9C: /* oe ligature */
+        printf("&#339;");
+        break;
+    case 0x9F: /* Y Dieresis */
+        printf("&#376;");
+        break;
+    default:
+        putchar(c);
+        break;
+    }
+}
+
+void print_utf8(unsigned short c)
+{
+    if (c == 0)
+        return;
+
+    if (c < 0x80)
+        OutputCharCorrected(c);
+    else if (c < 0x800)
+    {
+        putchar(0xC0 | (c >> 6));
+        put_utf8(c);
+    }
+    else
+    {
+        putchar(0xE0 | (c >> 12));
+        put_utf8(c >> 6);
+        put_utf8(c);
+    }
+}
+
+void put_utf8(unsigned short c)
+{
+    putchar(0x0080 | ((short)c & 0x003F));
+}
diff --git a/utf8.h b/utf8.h
@@ -0,0 +1,3 @@
+void OutputCharCorrected(unsigned char c);
+void print_utf8(unsigned short c);
+void put_utf8(unsigned short c);