olefs

Command-line tools to extract data from OLE documents such as doc, ppt, xls, and msg.
git clone https://logand.com/git/olefs.git/
Log | Files | Refs

ppt.c (12003B)


      1 // TODO proper little endian read/write
      2 // TODO pic in place instead of appended
      3 // TODO limit mem like timeout
      4 
      5 const char *VERSION =
      6 #include "VERSION"
      7   ;
      8 
      9 #include <stdlib.h>
     10 #include <stdio.h>
     11 #include <string.h>
     12 #include <errno.h>
     13 #include <unistd.h>
     14 #include <stdint.h>
     15 #include <assert.h>
     16 #include <limits.h>
     17 
     18 enum cmd {CMD_LS, CMD_CAT, CMD_TXT, CMD_HTML, CMD_EXTRACT};
     19 
     20 typedef uint8_t byte;
     21 typedef uint32_t dword;
     22 
     23 // MS-PPT PowerPoint (.ppt) Binary File Format
     24 
     25 struct RecordHeader {
     26   ushort recVer: 4; //(logand #x0f %dummy1))
     27   ushort recInstance: 12;  //(logior (ash %dummy2 4) (ash %dummy1 -4)))
     28   ushort recType;
     29   dword recLen;
     30 } __attribute__((__packed__));
     31 
     32 struct in {
     33   FILE *stream;
     34   long size;
     35   char *buf;
     36   long max;
     37   long pos;
     38 };
     39 
     40 static long in_tell(struct in *in) {
     41   if(in->buf) {
     42     return in->pos;
     43   }
     44   long z = ftell(in->stream);
     45   if(z < 0) {
     46     fprintf(stderr, "stream not seekable\n");
     47     exit(1);
     48   }
     49   return z;
     50 }
     51 
     52 #define MB (1024 * 1024)
     53 
     54 static void in_open(struct in *in, char *filename) {
     55   in->stream = filename ? fopen(filename, "r") : stdin;
     56   if(!in->stream) {
     57     fprintf(stderr, "Unable to open '%s'.\n", filename);
     58     exit(1);
     59   }
     60   in->size = 0;
     61   in->buf = NULL;
     62   in->max = 0;
     63   in->pos = 0;
     64   if(ftell(in->stream) < 0) { /* not seekable */
     65     in->size = MB;
     66     in->buf = malloc(in->size);
     67     in->max = 0;
     68     while(!feof(in->stream)) {
     69       if(in->size <= in->max) {
     70         in->size += MB;
     71         in->buf = realloc(in->buf, in->size);
     72       }
     73       size_t n = fread(&in->buf[in->max], 1, in->size - in->max, in->stream);
     74       if(n < 0) {
     75         fprintf(stderr, "unexpected end of file\n");
     76         exit(1);
     77       }
     78       in->max += n;
     79     }
     80   }
     81 }
     82 
     83 static void in_close(struct in *in) {
     84   free(in->buf);
     85   fclose(in->stream);
     86 }
     87 
     88 static size_t in_read(void *ptr, size_t size, size_t nmemb, struct in *in) {
     89   if(in->buf) {
     90     size_t n1 = size * nmemb;
     91     size_t n2 = n1 <= in->max - in->pos ? n1 : in->max - in->pos;
     92     memcpy(ptr, &in->buf[in->pos], n2);
     93     in->pos += n2;
     94     if(n2 % size) {
     95       fprintf(stderr, "partial data read\n");
     96       exit(1);
     97     }
     98     return n2 / size;
     99   }
    100   return fread(ptr, size, nmemb, in->stream);
    101 }
    102 
    103 static void in_seek(struct in *in, long offset) {
    104   int z = 1;
    105   if(in->buf) {
    106     if(offset <= in->max) {
    107       in->pos = offset;
    108       return;
    109     }
    110   } else {
    111     z = fseek(in->stream, offset, SEEK_SET);
    112   }
    113   if(z) {
    114     fprintf(stderr, "seek to %ld failed\n", offset);
    115     exit(1);
    116   }
    117 }
    118 
    119 static size_t read_RecordHeader(struct in *in, struct RecordHeader *x) {
    120   return in_read(x, sizeof(struct RecordHeader), 1, in);
    121 }
    122 
    123 static size_t write_RecordHeader(struct RecordHeader *x) {
    124   return fwrite(x, sizeof(struct RecordHeader), 1, stdout);
    125 }
    126 
    127 static void cat(struct in *in, FILE *out, dword n) {
    128   for(dword i = 0; i < n;) {
    129     char buf[4096];
    130     dword m = n - i;
    131     size_t n1 = in_read(buf, 1, m <= 4096 ? m : 4096, in);
    132     if(n1 <= 0) {
    133       fprintf(stderr, "unexpected end of file\n");
    134       exit(1);
    135     }
    136     size_t n2 = fwrite(buf, 1, n1, out);
    137     if(n1 != n2) {
    138       fprintf(stderr, "output failed\n");
    139       exit(1);
    140     }
    141     i += n1;
    142   }
    143 }
    144 
    145 static void utf8txt(uint16_t c) {
    146   if(c <= 0) return;
    147   if(c < 0x80) { // 0xxxxxxx
    148     switch(c) {
    149     case '\r': puts(""); break;
    150     default: putchar(c);
    151     }
    152   } else if(c < 0x800) { // 110xxxxx  10xxxxxx
    153     putchar(0xc0 | (c >> 6));
    154     putchar(0x80 | (0x3f & c));
    155   } else if(c < 0x10000) { // 1110xxxx  10xxxxxx  10xxxxxx
    156     putchar(0xe0 | (c >> 12));
    157     putchar(0x80 | (0x3f & (c >> 6)));
    158     putchar(0x80 | (0x3f & c));
    159   }
    160 }
    161 
    162 static void txt(struct RecordHeader *h, struct in *in) {
    163   switch(h->recType) {
    164   case 0x0fa0: // RT_TextCharsAtom utf16le
    165   case 0x0fba: // RT_CString
    166     for(int i = 0; i < h->recLen; i += 2) {
    167       uint16_t c;
    168       if(1 != in_read(&c, 2, 1, in)) {
    169         fprintf(stderr, "unexpected end of file\n");
    170         exit(1);
    171       }
    172       utf8txt(c);
    173     }
    174     puts("");
    175     break;
    176   case 0x0fa8: // RT_TextBytesAtom ascii
    177     for(int i = 0; i < h->recLen; i++) {
    178       uint8_t c;
    179       if(1 != in_read(&c, 1, 1, in)) {
    180         fprintf(stderr, "unexpected end of file\n");
    181         exit(1);
    182       }
    183       utf8txt(c);
    184     }
    185     puts("");
    186     break;
    187   case 0x03ee: // RT_Slide
    188   case 0x03e8: // RT_Document
    189     {
    190       static int slide = 0;
    191       if(0 < slide) puts("");
    192       slide++;
    193     }
    194   }
    195 }
    196 
    197 static void utf8html(uint16_t c) {
    198   if(c <= 0) return;
    199   if(c < 0x80) { // 0xxxxxxx
    200     switch(c) {
    201     case '&': printf("&amp;"); break;
    202     case '<': printf("&lt;"); break;
    203     case '>': printf("&gt;"); break;
    204     case '\'': printf("&quot;"); break;
    205     case '\r': puts("<br>"); break;
    206     default: putchar(c);
    207     }
    208   } else if(c < 0x800) { // 110xxxxx  10xxxxxx
    209     putchar(0xc0 | (c >> 6));
    210     putchar(0x80 | (0x3f & c));
    211   } else if(c < 0x10000) { // 1110xxxx  10xxxxxx  10xxxxxx
    212     putchar(0xe0 | (c >> 12));
    213     putchar(0x80 | (0x3f & (c >> 6)));
    214     putchar(0x80 | (0x3f & c));
    215   }
    216 }
    217 
    218 static void html(struct RecordHeader *h, struct in *in) {
    219   switch(h->recType) {
    220   case 0x0fa0: // RT_TextCharsAtom utf16le
    221   case 0x0fba: // RT_CString
    222     printf("<p>");
    223     for(int i = 0; i < h->recLen; i += 2) {
    224       uint16_t c;
    225       if(1 != in_read(&c, 2, 1, in)) {
    226         fprintf(stderr, "unexpected end of file\n");
    227         exit(1);
    228       }
    229       utf8html(c);
    230     }
    231     puts("</p>");
    232     break;
    233   case 0x0fa8: // RT_TextBytesAtom ascii
    234     printf("<p>");
    235     for(int i = 0; i < h->recLen; i++) {
    236       uint8_t c;
    237       if(1 != in_read(&c, 1, 1, in)) {
    238         fprintf(stderr, "unexpected end of file\n");
    239         exit(1);
    240       }
    241       utf8html(c);
    242     }
    243     puts("</p>");
    244     break;
    245   case 0x03ee: // RT_Slide
    246   case 0x03e8: // RT_Document
    247     {
    248       static int slide = 0;
    249       if(0 < slide) puts("<hr/>");
    250       slide++;
    251       printf("<h1>Slide %d</h1>\n", slide);
    252     }
    253   }
    254 }
    255 
    256 // MS-ODRAW Office Drawing Binary File Format
    257 
    258 struct POINT {
    259   dword x;
    260   dword y;
    261 } __attribute__((__packed__));
    262 
    263 struct RECT {
    264   dword left;
    265   dword top;
    266   dword right;
    267   dword bottom;
    268 } __attribute__((__packed__));
    269 
    270 struct OfficeArtMetafileHeader {
    271   dword cbSize;
    272   struct RECT rcBounds;
    273   struct POINT ptSize;
    274   dword cbSave;
    275   byte compression; // :member '(#x00 #xfe))
    276   byte filter; //:always #xfe))
    277 } __attribute__((__packed__));
    278 
    279 static size_t read_OfficeArtMetafileHeader(struct in *in,
    280                                            struct OfficeArtMetafileHeader *x) {
    281   return in_read(x, 1, sizeof(struct OfficeArtMetafileHeader), in);
    282 }
    283 
    284 static size_t read_guid(struct in *in, byte guid[]) {
    285   return in_read(guid, 1, 16, in);
    286 }
    287 
    288 static const struct OfficeArtBlip_config {
    289   ushort recType;
    290   ushort recInstance[4];
    291   char *ext;
    292   ushort guid2[2];
    293   int metafileHeader;
    294 } OfficeArtBlip_config[] = {
    295   {0xf01a, {0x3d4, 0x3d5,     0,     0}, "emf",  {0x3d5,     0}, 1},
    296   {0xf01b, {0x216, 0x217,     0,     0}, "wmf",  {0x217,     0}, 1},
    297   {0xf01c, {0x542, 0x543,     0,     0}, "pict", {0x543,     0}, 1},
    298   {0xf01d, {0x46a, 0x46b, 0x6e2, 0x6e3}, "jpeg", {0x46b, 0x6e3}, 0},
    299   {0xf01e, {0x6e0, 0x6e1,     0,     0}, "png",  {0x6e1,     0}, 0},
    300   {0xf01f, {0x7a8, 0x7a9,     0,     0}, "dib",  {0x7a9,     0}, 0},
    301   {0xf029, {0x6e4, 0x6e5,     0,     0}, "tiff", {0x6e5,     0}, 0},
    302   {0xf02a, {0x46a, 0x46b, 0x6e2, 0x6e3}, "jpeg", {0x46b, 0x6e3}, 0},
    303   {0}
    304 };
    305 
    306 static void extract(struct RecordHeader *h, struct in *in) {
    307   static int img = 0;
    308   for(int i = 0; OfficeArtBlip_config[i].recType; i++) {
    309     const struct OfficeArtBlip_config *c = &OfficeArtBlip_config[i];
    310     if(h->recType == c->recType) {
    311       char filename[PATH_MAX];
    312       snprintf(filename, PATH_MAX, "%d.%s", img++, c->ext);
    313       size_t n = 0;
    314       byte guid[16];
    315       n += read_guid(in, guid);
    316       if(h->recInstance == c->guid2[0]
    317          || (c->guid2[1] && h->recInstance == c->guid2[1]))
    318         n += read_guid(in, guid);
    319       if(c->metafileHeader) {
    320         struct OfficeArtMetafileHeader h2;
    321         n += read_OfficeArtMetafileHeader(in, &h2);
    322       } else {
    323         byte b;
    324         n += in_read(&b, 1, 1, in);
    325       }
    326       FILE *f = fopen(filename, "w");
    327       cat(in, f, h->recLen - n);
    328       fclose(f);
    329     }
    330   }
    331 }
    332 
    333 static void out(struct RecordHeader *h, struct in *in, int level, int i, int xlevel, int xi, enum cmd c) {
    334   switch(c) {
    335   case CMD_LS:
    336     printf("%4d %4d 0x%04x 0x%04x 0x%04x %10u\n",
    337            level, i, h->recVer, h->recInstance, h->recType, h->recLen);
    338     break;
    339   case CMD_CAT:
    340     if(level == xlevel && i == xi) {
    341       if(1 != write_RecordHeader(h)) {
    342         fprintf(stderr, "output failed\n");
    343         exit(1);
    344       }
    345       cat(in, stdout, h->recLen);
    346       exit(0);
    347     }
    348   case CMD_TXT: txt(h, in); break;
    349   case CMD_HTML: html(h, in); break;
    350   case CMD_EXTRACT: extract(h, in); break;
    351   }
    352 }
    353 
    354 static void walk(struct in *in, int level, dword pos, int xlevel, int xi, enum cmd cmd) {
    355   assert(0 <= level);
    356   assert(0 <= pos);
    357   for(int i = 0;; i++) {
    358     if(0 < pos && pos <= in_tell(in))
    359       break;
    360     struct RecordHeader h;
    361     size_t n = read_RecordHeader(in, &h);
    362     if(n <= 0) {
    363       break; // EOF
    364     }
    365     if(1 != n) {
    366       fprintf(stderr, "error reading record header\n");
    367       exit(1);
    368     }
    369     size_t start = in_tell(in), end = start + h.recLen;
    370     out(&h, in, level, i, xlevel, xi, cmd);
    371     if(0xf == h.recVer) {
    372       if(0 < pos)
    373         end = end < pos ? end : pos;
    374       walk(in, 1 + level, end, xlevel, xi, cmd);
    375     } else
    376       in_seek(in, end);
    377   }
    378   // TODO xlevel xi not found -> exit(1)
    379 }
    380 
    381 static int cmd_ls(char *argv[]) {
    382   char *filename = argv[0];
    383   struct in in;
    384   in_open(&in, filename);
    385   walk(&in, 0, 0, -1, -1, CMD_LS);
    386   in_close(&in);
    387   return 0;
    388 }
    389 
    390 static int cmd_cat(char *argv[]) {
    391   char *level = argv[0];
    392   if(!level) {
    393     fprintf(stderr, "level expected\n");
    394     return 1;
    395   }
    396   int xlevel;
    397   if(1 != sscanf(level, "%d", &xlevel)) {
    398     fprintf(stderr, "unknown level '%s'\n", level);
    399     return 1;
    400   }
    401   char *i = argv[1];
    402   if(!i) {
    403     fprintf(stderr, "i expected\n");
    404     return 1;
    405   }
    406   int xi;
    407   if(1 != sscanf(i, "%d", &xi)) {
    408     fprintf(stderr, "unknown i '%s'\n", i);
    409     return 1;
    410   }
    411   char *filename = argv[2];
    412   struct in in;
    413   in_open(&in, filename);
    414   walk(&in, 0, 0, xlevel, xi, CMD_CAT);
    415   in_close(&in);
    416   return 0;
    417 }
    418 
    419 static int cmd_txt(char *argv[]) {
    420   char *filename = argv[0];
    421   struct in in;
    422   in_open(&in, filename);
    423   walk(&in, 0, 0, -1, -1, CMD_TXT);
    424   in_close(&in);
    425   return 0;
    426 }
    427 
    428 static int cmd_html(char *argv[]) {
    429   char *filename = argv[0];
    430   struct in in;
    431   in_open(&in, filename);
    432   walk(&in, 0, 0, -1, -1, CMD_HTML);
    433   in_close(&in);
    434   return 0;
    435 }
    436 
    437 static int cmd_extract(char *argv[]) {
    438   char *filename = argv[0];
    439   struct in in;
    440   in_open(&in, filename);
    441   walk(&in, 0, 0, -1, -1, CMD_EXTRACT);
    442   in_close(&in);
    443   return 0;
    444 }
    445 
    446 static int cmd_help(void) {
    447   printf("usage:\n");
    448   printf("  ppt ls [filename]               list records\n");
    449   printf("  ppt cat level index [filename]  print record\n");
    450   printf("  ppt txt [filename]              print text\n");
    451   printf("  ppt html [filename]             print html\n");
    452   printf("  ppt extract [filename]          extract pictures\n");
    453   printf("  ppt --help                      print help\n");
    454   printf("  ppt --version                   print version\n");
    455   return 0;
    456 }
    457 
    458 static int cmd_version(void) {
    459   printf("%s\n", VERSION);
    460   return 0;
    461 }
    462 
    463 int main(int argc, char *argv[]) {
    464   char *cmd = *++argv;
    465   if(!cmd) {
    466     fprintf(stderr, "command expected\n");
    467     return 1;
    468   }
    469   ++argv;
    470   if(!strcmp("ls", cmd)) return cmd_ls(argv);
    471   else if(!strcmp("cat", cmd)) return cmd_cat(argv);
    472   else if(!strcmp("txt", cmd)) return cmd_txt(argv);
    473   else if(!strcmp("html", cmd)) return cmd_html(argv);
    474   else if(!strcmp("extract", cmd)) return cmd_extract(argv);
    475   else if(!strcmp("--help", cmd)) return cmd_help();
    476   else if(!strcmp("--version", cmd)) return cmd_version();
    477   else fprintf(stderr, "unexpected command %s\n", cmd);
    478   return 1;
    479 }