/* * app_multiplexed.c -- creates a multiplexed document for XHTML-Print UA * * References: * Applendix B.2 MIME type Application/Vnd.pwg-multiplexed * http://www.ietf.org/rfc/rfc3391.txt * http://www.w3.org/TR/xhtml-print/ * * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #if defined (_WIN32) #include #endif #include #include #include #include #include #include #define TRUE 1 #define FALSE 0 /* type of urls */ typedef enum { URL_abs_e, /* complete */ URL_rel_e, /* needs base */ URL_frag_e, /* portion */ URL_any_e, /* all of the above ? */ URL_illegal_e /* invalid */ } URL_type_t; /* url manipulation functions */ /* convert a potentially escaped URL to plain text */ int URL_convert_to_plain (const unsigned char *escaped_p, unsigned char *plain_p, int max_size); /* check the type of the url */ URL_type_t URL_check_url (const unsigned char *url_p); /* Combine base (base_p) and relative (relative_p) URLs into url_p.*/ int URL_combine_abs_and_rel (const unsigned char *base_p, const unsigned char *relative_p, unsigned char *url_p, int max_size); /* Extract base URL from url_p (plain) */ int URL_extract_base (const unsigned char *url_p, unsigned char *base_p, int max_size); #ifndef MIN #define MIN(a,b) (((a) <= (b)) ? (a) : (b)) #endif static int encodeHeaders = FALSE; static int OneObj = FALSE; static int ForceBeforeRoot = FALSE; static char *defaultBaseP; static int reuseMsgNumbers = FALSE; static char *Content_Disposition = NULL; static int ChunkSize = 1024; static void scan_xhtml_file (void); static void scan_css_file (void); static void scan_jpeg_file (void); /* double-linked list node */ typedef struct list_node_st { struct list_node_st *prev_p; struct list_node_st *next_p; void * data; /* allow list to hold any data object */ } list_node_t; /* initialize list */ void list_init(list_node_t* lst_p) { lst_p->prev_p = lst_p; lst_p->next_p = lst_p; } /* insert node before another */ void list_insert_before(list_node_t* node_p, list_node_t* new_p) { new_p->next_p = node_p; new_p->prev_p = node_p->prev_p; new_p->prev_p->next_p = new_p; node_p->prev_p = new_p; } /* remove node */ list_node_t* list_unlink(list_node_t* node_p) { node_p->next_p->prev_p = node_p->prev_p; node_p->prev_p->next_p = node_p->next_p; node_p->prev_p = node_p->next_p = node_p; return (node_p); } /* remove a node from the front of the list */ list_node_t* list_remove_front(list_node_t* lst_p) { assert(lst_p->prev_p != lst_p); return (list_unlink(lst_p->next_p)); } /* -------------------------------------- */ /* list of referenced objects */ typedef struct refobj_st { struct list_node_st children; struct refobj_st *parent_obj_p; int parent_offs; const struct extension_st *ext_p; /* how to manage this object */ const char *url_p; /* url of object*/ const char *type_p; int scanned; /* has been scanned ? */ int root; /* is root doc ? */ unsigned char *file_p; /* the contents */ unsigned int file_len; /* size of content */ char *content_hdr_p; int content_hdr_len; /* size of the MIME doc content header */ int id; /* obj id (used in the chunk header */ int msg; int output_offs; int img_hdr_size; } refobj_t; /* the list of referenced objects */ list_node_t refobjs; typedef struct interleave_group_st { /* unique members */ char **urls_p; int urls_cnt; } interleave_group_t; /* a list of interleaved chunks? */ list_node_t interleave_groups; /* a message chunk */ typedef struct chunk_node_st { int parent_offs; int length; } chunk_node_t; /* a squence of chunks */ typedef struct chunk_seq_st { refobj_t *obj_p; int chunks_cnt; chunk_node_t *chunks_p; int output_idx; int output_offs; } chunk_seq_t; /* prototypes to chucking functions */ static int image_get_parent_offs (refobj_t *obj_p); static int image_get_chunk_size (refobj_t *obj_p); static void image_get_chunk_seq (refobj_t *obj_p, chunk_seq_t *seq_p); /* how to scan and chunk a particular type of file */ typedef struct extension_st { const char *ext_p; /* file extension: content hint */ const char *type_p; /* MIME content type, see http://www.ietf.org/rfc/rfc1341.txt */ void (*scan)(void); /* scanning function */ /* chunking functions */ void (*get_chunk_seq)(refobj_t *obj_p, chunk_seq_t *seq_p); int (*get_parent_offs)(refobj_t *obj_p); int (*get_chunk_size)(refobj_t *obj_p); } extension_t; /* MIME content types, see http://www.ietf.org/rfc/rfc1341.txt */ #define TYPE_TEXT "text/plain" #define TYPE_CSS "text/css" #define TYPE_XHTML "application/vnd.pwg-xhtml-print+xml" #define TYPE_JPEG "image/jpeg" #define TYPE_GIF "image/gif" #define TYPE_PNG "image/png" #define TYPE_MULTIPLEXED "application/vnd.pwg-multiplexed" /* http://www.ietf.org/rfc/rfc3391.txt */ static const extension_t extensions[] = { { ".xhtml;.html;.htm;.HTM;.HTML;.XHTML", TYPE_XHTML, scan_xhtml_file, NULL, NULL, NULL }, { ".css;.CSS", TYPE_CSS, scan_css_file, NULL, NULL, NULL }, { ".jpg;.jpeg;.JPG;.JPEG", TYPE_JPEG, scan_jpeg_file, image_get_chunk_seq, image_get_parent_offs, image_get_chunk_size }, { ".gif;.GIF", TYPE_GIF, NULL, image_get_chunk_seq, image_get_parent_offs, image_get_chunk_size }, { ".png;.PNG", TYPE_PNG, NULL, image_get_chunk_seq, image_get_parent_offs, image_get_chunk_size }, { ".txt;.TXT", TYPE_TEXT, NULL, NULL, NULL, NULL }, { NULL, TYPE_TEXT, NULL, NULL, NULL, NULL } }; char *CharTranslateTbl[] = { "=00", /* U+0000 */ "=01", /* U+0001 */ "=02", /* U+0002 */ "=03", /* U+0003 */ "=04", /* U+0004 */ "=05", /* U+0005 */ "=06", /* U+0006 */ "=07", /* U+0007 */ "=08", /* U+0008 */ "=09", /* U+0009 */ "=0A", /* U+000A */ "=0B", /* U+000B */ "=0C", /* U+000C */ "=0D", /* U+000D */ "=0E", /* U+000E */ "=0F", /* U+000F */ "=10", /* U+0010 */ "=11", /* U+0011 */ "=12", /* U+0012 */ "=13", /* U+0013 */ "=14", /* U+0014 */ "=15", /* U+0015 */ "=16", /* U+0016 */ "=17", /* U+0017 */ "=18", /* U+0018 */ "=19", /* U+0019 */ "=1A", /* U+001A */ "=1B", /* U+001B */ "=1C", /* U+001C */ "=1D", /* U+001D */ "=1E", /* U+001E */ "=1F", /* U+001F */ "%20", /* U+0020 SPACE */ "=21", /* U+0021 EXCLAMATION MARK */ "=22", /* U+0022 QUOTATION MARK */ "%23", /* U+0023 NUMBER SIGN */ "=24", /* U+0024 DOLLAR SIGN */ "%25", /* U+0025 PERCENT SIGN */ "%26", /* U+0026 AMPERSAND */ "%27", /* U+0027 APOSTROPHE */ "=28", /* U+0028 LEFT PARENTHESIS */ "=29", /* U+0029 RIGHT PARENTHESIS */ "=2A", /* U+002A ASTERISK */ "=2B", /* U+002B PLUS SIGN */ "=2C", /* U+002C COMMA */ "=2D", /* U+002D HYPHEN-MINUS */ "=2E", /* U+002E FULL STOP */ "=2F", /* U+002F SOLIDUS */ "=30", /* U+0030 DIGIT ZERO */ "=31", /* U+0031 DIGIT ONE */ "=32", /* U+0032 DIGIT TWO */ "=33", /* U+0033 DIGIT THREE */ "=34", /* U+0034 DIGIT FOUR */ "=35", /* U+0035 DIGIT FIVE */ "=36", /* U+0036 DIGIT SIX */ "=37", /* U+0037 DIGIT SEVEN */ "=38", /* U+0038 DIGIT EIGHT */ "=39", /* U+0039 DIGIT NINE */ "=3A", /* U+003A COLON */ "=3B", /* U+003B SEMICOLON */ "=3C", /* U+003C LESS-THAN SIGN */ "=3D", /* U+003D EQUALS SIGN */ "=3E", /* U+003E GREATER-THAN SIGN */ "=3F", /* U+003F QUESTION MARK */ "=40", /* U+0040 COMMERCIAL AT */ "=41", /* U+0041 LATIN CAPITAL LETTER A */ "=42", /* U+0042 LATIN CAPITAL LETTER B */ "=43", /* U+0043 LATIN CAPITAL LETTER C */ "=44", /* U+0044 LATIN CAPITAL LETTER D */ "=45", /* U+0045 LATIN CAPITAL LETTER E */ "=46", /* U+0046 LATIN CAPITAL LETTER F */ "=47", /* U+0047 LATIN CAPITAL LETTER G */ "=48", /* U+0048 LATIN CAPITAL LETTER H */ "=49", /* U+0049 LATIN CAPITAL LETTER I */ "=4A", /* U+004A LATIN CAPITAL LETTER J */ "=4B", /* U+004B LATIN CAPITAL LETTER K */ "=4C", /* U+004C LATIN CAPITAL LETTER L */ "=4D", /* U+004D LATIN CAPITAL LETTER M */ "=4E", /* U+004E LATIN CAPITAL LETTER N */ "=4F", /* U+004F LATIN CAPITAL LETTER O */ "=50", /* U+0050 LATIN CAPITAL LETTER P */ "=51", /* U+0051 LATIN CAPITAL LETTER Q */ "=52", /* U+0052 LATIN CAPITAL LETTER R */ "=53", /* U+0053 LATIN CAPITAL LETTER S */ "=54", /* U+0054 LATIN CAPITAL LETTER T */ "=55", /* U+0055 LATIN CAPITAL LETTER U */ "=56", /* U+0056 LATIN CAPITAL LETTER V */ "=57", /* U+0057 LATIN CAPITAL LETTER W */ "=58", /* U+0058 LATIN CAPITAL LETTER X */ "=59", /* U+0059 LATIN CAPITAL LETTER Y */ "=5A", /* U+005A LATIN CAPITAL LETTER Z */ "%5B", /* U+005B LEFT SQUARE BRACKET */ "%5C", /* U+005C REVERSE SOLIDUS */ "%5D", /* U+005D RIGHT SQUARE BRACKET */ "%5E", /* U+005E CIRCUMFLEX ACCENT */ "=5F", /* U+005F LOW LINE */ "=60", /* U+0060 GRAVE ACCENT */ "=61", /* U+0061 LATIN SMALL LETTER A */ "=62", /* U+0062 LATIN SMALL LETTER B */ "=63", /* U+0063 LATIN SMALL LETTER C */ "=64", /* U+0064 LATIN SMALL LETTER D */ "=65", /* U+0065 LATIN SMALL LETTER E */ "=66", /* U+0066 LATIN SMALL LETTER F */ "=67", /* U+0067 LATIN SMALL LETTER G */ "=68", /* U+0068 LATIN SMALL LETTER H */ "=69", /* U+0069 LATIN SMALL LETTER I */ "=6A", /* U+006A LATIN SMALL LETTER J */ "=6B", /* U+006B LATIN SMALL LETTER K */ "=6C", /* U+006C LATIN SMALL LETTER L */ "=6D", /* U+006D LATIN SMALL LETTER M */ "=6E", /* U+006E LATIN SMALL LETTER N */ "=6F", /* U+006F LATIN SMALL LETTER O */ "=70", /* U+0070 LATIN SMALL LETTER P */ "=71", /* U+0071 LATIN SMALL LETTER Q */ "=72", /* U+0072 LATIN SMALL LETTER R */ "=73", /* U+0073 LATIN SMALL LETTER S */ "=74", /* U+0074 LATIN SMALL LETTER T */ "=75", /* U+0075 LATIN SMALL LETTER U */ "=76", /* U+0076 LATIN SMALL LETTER V */ "=77", /* U+0077 LATIN SMALL LETTER W */ "=78", /* U+0078 LATIN SMALL LETTER X */ "=79", /* U+0079 LATIN SMALL LETTER Y */ "=7A", /* U+007A LATIN SMALL LETTER Z */ "%7B", /* U+007B LEFT CURLY BRACKET */ "%7C", /* U+007C VERTICAL LINE */ "%7D", /* U+007D RIGHT CURLY BRACKET */ "%7E", /* U+007E TILDE */ "=7F", /* U+007F */ "=80", /* U+0080 */ "=81", /* U+0081 */ "=82", /* U+0082 */ "=83", /* U+0083 */ "=84", /* U+0084 */ "=85", /* U+0085 */ "=86", /* U+0086 */ "=87", /* U+0087 */ "=88", /* U+0088 */ "=89", /* U+0089 */ "=8A", /* U+008A */ "=8B", /* U+008B */ "=8C", /* U+008C */ "=8D", /* U+008D */ "=8E", /* U+008E */ "=8F", /* U+008F */ "=90", /* U+0090 */ "=91", /* U+0091 */ "=92", /* U+0092 */ "=93", /* U+0093 */ "=94", /* U+0094 */ "=95", /* U+0095 */ "=96", /* U+0096 */ "=97", /* U+0097 */ "=98", /* U+0098 */ "=99", /* U+0099 */ "=9A", /* U+009A */ "=9B", /* U+009B */ "=9C", /* U+009C */ "=9D", /* U+009D */ "=9E", /* U+009E */ "=9F", /* U+009F */ "=A0", /* U+00A0 NO-BREAK SPACE */ "=A1", /* U+00A1 INVERTED EXCLAMATION MARK */ "=A2", /* U+00A2 CENT SIGN */ "=A3", /* U+00A3 POUND SIGN */ "=A4", /* U+00A4 CURRENCY SIGN */ "=A5", /* U+00A5 YEN SIGN */ "=A6", /* U+00A6 BROKEN BAR */ "=A7", /* U+00A7 SECTION SIGN */ "=A8", /* U+00A8 DIAERESIS */ "=A9", /* U+00A9 COPYRIGHT SIGN */ "=AA", /* U+00AA FEMININE ORDINAL INDICATOR */ "=AB", /* U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */ "=AC", /* U+00AC NOT SIGN */ "=AD", /* U+00AD SOFT HYPHEN */ "=AE", /* U+00AE REGISTERED SIGN */ "=AF", /* U+00AF MACRON */ "=B0", /* U+00B0 DEGREE SIGN */ "=B1", /* U+00B1 PLUS-MINUS SIGN */ "=B2", /* U+00B2 SUPERSCRIPT TWO */ "=B3", /* U+00B3 SUPERSCRIPT THREE */ "=B4", /* U+00B4 ACUTE ACCENT */ "=B5", /* U+00B5 MICRO SIGN */ "=B6", /* U+00B6 PILCROW SIGN */ "=B7", /* U+00B7 MIDDLE DOT */ "=B8", /* U+00B8 CEDILLA */ "=B9", /* U+00B9 SUPERSCRIPT ONE */ "=BA", /* U+00BA MASCULINE ORDINAL INDICATOR */ "=BB", /* U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */ "=BC", /* U+00BC VULGAR FRACTION ONE QUARTER */ "=BD", /* U+00BD VULGAR FRACTION ONE HALF */ "=BE", /* U+00BE VULGAR FRACTION THREE QUARTERS */ "=BF", /* U+00BF INVERTED QUESTION MARK */ "=C0", /* U+00C0 LATIN CAPITAL LETTER A WITH GRAVE */ "=C1", /* U+00C1 LATIN CAPITAL LETTER A WITH ACUTE */ "=C2", /* U+00C2 LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ "=C3", /* U+00C3 LATIN CAPITAL LETTER A WITH TILDE */ "=C4", /* U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS */ "=C5", /* U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE */ "=C6", /* U+00C6 LATIN CAPITAL LETTER AE */ "=C7", /* U+00C7 LATIN CAPITAL LETTER C WITH CEDILLA */ "=C8", /* U+00C8 LATIN CAPITAL LETTER E WITH GRAVE */ "=C9", /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ "=CA", /* U+00CA LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ "=CB", /* U+00CB LATIN CAPITAL LETTER E WITH DIAERESIS */ "=CC", /* U+00CC LATIN CAPITAL LETTER I WITH GRAVE */ "=CD", /* U+00CD LATIN CAPITAL LETTER I WITH ACUTE */ "=CE", /* U+00CE LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ "=CF", /* U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS */ "=D0", /* U+00D0 LATIN CAPITAL LETTER ETH */ "=D1", /* U+00D1 LATIN CAPITAL LETTER N WITH TILDE */ "=D2", /* U+00D2 LATIN CAPITAL LETTER O WITH GRAVE */ "=D3", /* U+00D3 LATIN CAPITAL LETTER O WITH ACUTE */ "=D4", /* U+00D4 LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ "=D5", /* U+00D5 LATIN CAPITAL LETTER O WITH TILDE */ "=D6", /* U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS */ "=D7", /* U+00D7 MULTIPLICATION SIGN */ "=D8", /* U+00D8 LATIN CAPITAL LETTER O WITH STROKE */ "=D9", /* U+00D9 LATIN CAPITAL LETTER U WITH GRAVE */ "=DA", /* U+00DA LATIN CAPITAL LETTER U WITH ACUTE */ "=DB", /* U+00DB LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ "=DC", /* U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS */ "=DD", /* U+00DD LATIN CAPITAL LETTER Y WITH ACUTE */ "=DE", /* U+00DE LATIN CAPITAL LETTER THORN */ "=DF", /* U+00DF LATIN SMALL LETTER SHARP S */ "=E0", /* U+00E0 LATIN SMALL LETTER A WITH GRAVE */ "=E1", /* U+00E1 LATIN SMALL LETTER A WITH ACUTE */ "=E2", /* U+00E2 LATIN SMALL LETTER A WITH CIRCUMFLEX */ "=E3", /* U+00E3 LATIN SMALL LETTER A WITH TILDE */ "=E4", /* U+00E4 LATIN SMALL LETTER A WITH DIAERESIS */ "=E5", /* U+00E5 LATIN SMALL LETTER A WITH RING ABOVE */ "=E6", /* U+00E6 LATIN SMALL LETTER AE */ "=E7", /* U+00E7 LATIN SMALL LETTER C WITH CEDILLA */ "=E8", /* U+00E8 LATIN SMALL LETTER E WITH GRAVE */ "=E9", /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ "=EA", /* U+00EA LATIN SMALL LETTER E WITH CIRCUMFLEX */ "=EB", /* U+00EB LATIN SMALL LETTER E WITH DIAERESIS */ "=EC", /* U+00EC LATIN SMALL LETTER I WITH GRAVE */ "=ED", /* U+00ED LATIN SMALL LETTER I WITH ACUTE */ "=EE", /* U+00EE LATIN SMALL LETTER I WITH CIRCUMFLEX */ "=EF", /* U+00EF LATIN SMALL LETTER I WITH DIAERESIS */ "=F0", /* U+00F0 LATIN SMALL LETTER ETH */ "=F1", /* U+00F1 LATIN SMALL LETTER N WITH TILDE */ "=F2", /* U+00F2 LATIN SMALL LETTER O WITH GRAVE */ "=F3", /* U+00F3 LATIN SMALL LETTER O WITH ACUTE */ "=F4", /* U+00F4 LATIN SMALL LETTER O WITH CIRCUMFLEX */ "=F5", /* U+00F5 LATIN SMALL LETTER O WITH TILDE */ "=F6", /* U+00F6 LATIN SMALL LETTER O WITH DIAERESIS */ "=F7", /* U+00F7 DIVISION SIGN */ "=F8", /* U+00F8 LATIN SMALL LETTER O WITH STROKE */ "=F9", /* U+00F9 LATIN SMALL LETTER U WITH GRAVE */ "=FA", /* U+00FA LATIN SMALL LETTER U WITH ACUTE */ "=FB", /* U+00FB LATIN SMALL LETTER U WITH CIRCUMFLEX */ "=FC", /* U+00FC LATIN SMALL LETTER U WITH DIAERESIS */ "=FD", /* U+00FD LATIN SMALL LETTER Y WITH ACUTE */ "=FE", /* U+00FE LATIN SMALL LETTER THORN */ "=FF" /* U+00FF LATIN SMALL LETTER Y WITH DIAERESIS */ }; #define IS_SAFE_CHAR(c) ((c == '!') || (c == '$') || \ (c == '*') || (c == '+') || \ (c == '-') || (c == '_') || \ (c >= '0' && c <= '9') || \ (c >= 'A' && c <= 'Z') || \ (c >= 'a' && c <= 'z') ) int encodeChar(const char *str , char **p) { int c = *str; assert(c >= 0 && c < 0x0100); /* in the range of U+0000 - U+00FF */ if ( IS_SAFE_CHAR(c) ) { *p = (char *)str; return 1; } /* translate all other characters */ *p = CharTranslateTbl[c]; return 3; } void encodeUrl(const char *url, char *eurl) { char *src; int len; int i; int isEncoded = 0; char *prefix = "=?UTF-8?Q?"; char *suffix = "?="; /* scan the url to see if it has to be encoded */ for(src= (char *)url; *src; src++) { if(!IS_SAFE_CHAR(*src)) { isEncoded = 1; break; } } if(isEncoded) { /* copy the encoded string prefix */ for(src = prefix; *src; ) { *eurl++ = *src++; } } while(*url) { len = encodeChar(url++, &src); for (i=0;i< len; i++) *eurl++ = *src++; } if(isEncoded) { /* copy the encoded string suffix */ for (src = suffix; *src; ) { *eurl++ = *src++; } } *eurl = '\0'; } static char content_hdr[1024]; #define CONTENT (content_hdr + strlen(content_hdr)) int IncludeContentID = 0; int ContentID=0; static void setup_content_hdr (refobj_t *obj_p) { char encodedUrlStr[1024]; char buf[41]; int offset = 0; int len; int size; int i; char *src, *dest; content_hdr[0] = '\0'; sprintf(CONTENT, "Content-Type: %s\r\n", obj_p->ext_p->type_p); if(encodeHeaders) { encodeUrl(obj_p->url_p, encodedUrlStr); if (strlen(encodedUrlStr) > 40) { /* fold url */ sprintf(CONTENT, "Content-Location:"); len = strlen(encodedUrlStr); offset = 0; while (len > 0) { if(len < 40) size = len; else size = 40; for (i=0,src=encodedUrlStr+offset,dest=buf;iurl_p); } if(IncludeContentID) sprintf(CONTENT, "Content-ID: <%s-%d>\r\n", obj_p->url_p, obj_p->id); if(Content_Disposition) sprintf(CONTENT, "Content-Disposition: %s\r\n", Content_Disposition); sprintf(CONTENT, "\r\n"); obj_p->content_hdr_len = strlen(content_hdr); obj_p->content_hdr_p = malloc(obj_p->content_hdr_len + 1); strcpy(obj_p->content_hdr_p, content_hdr); } #undef CONTENT /* get content type based on filename extension */ static const extension_t *match_extension (const char *filename_p) { int idx; int len = strlen(filename_p); /* for each know file extension */ for (idx = 0; extensions[idx].ext_p != NULL; ++idx) { const char *cur_p, *next_p; cur_p = extensions[idx].ext_p; do { int tlen; next_p = strchr(cur_p, ';'); if (next_p == NULL) tlen = strlen(cur_p); else (tlen = next_p - cur_p), ++next_p; if (tlen < len && !strncmp(filename_p + len - tlen, cur_p, tlen)) return (&extensions[idx]); } while ((cur_p = next_p) != NULL); } return (&extensions[idx]); } const char *scan_file_p; /* load file into memory */ static int load_file (const char *filename_p, unsigned char **data_pp, unsigned int *data_len_p) { FILE *file_p; int len = strlen(filename_p); char *fn_p = malloc(len+1); strcpy(fn_p, filename_p); (void)URL_convert_to_plain((unsigned char*)filename_p, (unsigned char*)fn_p, len+1); /* assuming file accessable, local file system? */ file_p = fopen(fn_p, "rb"); /* read, binary */ if (file_p == NULL) { /* open failed */ free(fn_p); return (FALSE); } /* get size of file and allocate memory to hold it */ fseek(file_p, 0, SEEK_END); *data_len_p = ftell(file_p); *data_pp = malloc(*data_len_p + 1); if (*data_pp == NULL) /* alloc failed */ return (FALSE); /* return to beginning of file and read it in */ fseek(file_p, 0, SEEK_SET); fread(*data_pp, 1, *data_len_p, file_p); (*data_pp)[*data_len_p] = '\0'; assert(*data_pp != NULL); /* clean up */ fclose(file_p); free(fn_p); return (TRUE); } /* skip whitespace */ static void skip_ws (const char **str_pp) { const char *str_p = *str_pp; while (isspace(*str_p)) ++str_p; *str_pp = str_p; } static const char *filename_base_p; static char filename_buf[256]; refobj_t *scan_obj_p, *root_obj_p; static const char *build_filename (const char *name_p) { char *p; if(strncmp(name_p, "cid:", 4) == 0 ) { return name_p+4; } if(URL_abs_e == URL_check_url((unsigned char*)name_p)) { p = strchr(name_p, '/'); return (p+1); } else { sprintf(filename_buf, "%s/%s", filename_base_p, name_p); return (filename_buf); } } /* scan objects */ static void scan_objs (void) { list_node_t *node_p; refobj_t * obj_p; int scanned; do { scanned = FALSE; /* traverse list of referenced objects */ for(node_p = refobjs.next_p; node_p != &refobjs; node_p = node_p->next_p) { scan_obj_p = (refobj_t *)node_p->data; obj_p = scan_obj_p; scan_file_p = (const char*) scan_obj_p->file_p; if (obj_p->scanned) { continue; } obj_p->scanned = TRUE; if (obj_p->ext_p->scan != NULL) { fprintf(stderr, "Scanning URL: %s%s\n", obj_p->url_p, obj_p->root ? " (root)" : ""); (obj_p->ext_p->scan)(); scanned = TRUE; break; } } } while (scanned); } /* locate object based on url */ static refobj_t *find_url (const char *url_p) { list_node_t *node_p; refobj_t *obj_p; /* traverse list */ for (node_p = refobjs.next_p; node_p != &refobjs; node_p = node_p->next_p) { obj_p = (refobj_t *)node_p->data; if (strcmp(obj_p->url_p, url_p) == 0) return (obj_p); } return (NULL); } static int obj_cnt; static int msg_number; static int get_obj_len (refobj_t *obj_p) { return (obj_p->file_len + obj_p->content_hdr_len); } #define MAXBUFF 4*1024 /* add a URL as a reference */ static void add_url (const char *start_p, const char *end_p, int root, int offs) { refobj_t *obj_p; list_node_t* node_p; unsigned char *file_p; unsigned int file_len; char *url_p = (char*)start_p; int len = end_p - start_p; unsigned char buffer[MAXBUFF]; unsigned char normalized_url[MAXBUFF]; if (url_p == NULL || strlen(url_p) == 0) { fprintf(stderr, "Didn't add URL (no string)\n"); return; } memcpy(buffer, url_p, len); buffer[len] = '\0'; len = URL_convert_to_plain(buffer, normalized_url, MAXBUFF); url_p = (char *)normalized_url; if (root) { const char *last_slash_p = url_p; while (strchr(last_slash_p + 1, '/') != NULL) last_slash_p = strchr(last_slash_p + 1, '/'); if (url_p < last_slash_p) { char *base_p = malloc(last_slash_p - url_p + 1); memcpy(base_p, url_p, last_slash_p - url_p); base_p[last_slash_p - url_p] = '\0'; filename_base_p = base_p, url_p = (char*)last_slash_p + 1; } else filename_base_p = "./"; fprintf(stderr, "Filename base: %s\n", filename_base_p); } else { char *temp_p; url_p = malloc(len+1); assert(url_p); memcpy(url_p, normalized_url, len+1); url_p[len] = '\0'; if(defaultBaseP != NULL) { if(URL_illegal_e != URL_check_url((unsigned char*)url_p)) { len = strlen(defaultBaseP) + len + 1; temp_p = malloc(len); /* check if this is a cid or http scheme */ if (strcmp(defaultBaseP, "cid:") == 0) { /* cid scheme, just copy the base and content id */ strcpy(temp_p, defaultBaseP); strcpy(temp_p+4,url_p); } else { /* http scheme */ len = URL_combine_abs_and_rel((unsigned char *)defaultBaseP, (unsigned char *)url_p, (unsigned char *)temp_p, len); } free(url_p); url_p = temp_p; } } } if (find_url(url_p) && OneObj) { fprintf(stderr, "URL already referenced: %s\n", url_p); return; } if (!load_file(build_filename(url_p), &file_p, &file_len)) { fprintf(stderr, "Failed to add URL: %s%s - no file.\n", url_p, root ? " (root)" : ""); return; } /* if parent object is root object */ if (scan_obj_p && (root_obj_p == scan_obj_p)) { if (ForceBeforeRoot) offs = 0; } obj_p = (refobj_t*) malloc(sizeof(refobj_t)); obj_p->url_p = url_p; obj_p->ext_p = match_extension(obj_p->url_p); obj_p->scanned = FALSE; obj_p->root = root; obj_p->parent_obj_p = scan_obj_p; obj_p->parent_offs = offs + (scan_obj_p != NULL ? scan_obj_p->content_hdr_len : 0); obj_p->file_p = file_p; obj_p->file_len = file_len; obj_p->id = ++obj_cnt; obj_p->output_offs = 0; setup_content_hdr(obj_p); list_init(&(obj_p->children)); if (obj_p->parent_obj_p != NULL) { /* add obj to the parent's list of children */ node_p = (list_node_t *)malloc (sizeof(list_node_t)); assert(node_p); node_p->data = (void *)obj_p; list_insert_before(&obj_p->parent_obj_p->children, node_p); } fprintf(stderr, "Added URL: %s%s; type=\"%s\" (parent offset %d)\n", url_p, root ? " (root)" : "", obj_p->ext_p->type_p, obj_p->parent_offs); if (root) root_obj_p = obj_p; node_p = (list_node_t *)malloc(sizeof(list_node_t)); assert(node_p); node_p->data = (void *)obj_p; list_insert_before(&refobjs, node_p); } static int match_tag (const char *str_p, const char *tag_id_p, const char **tag_pp, int *tag_len_p) { const char *end_p; if (*str_p == '<' && strncmp(str_p + 1, tag_id_p, strlen(tag_id_p)) == 0 && (isspace(str_p[strlen(tag_id_p) + 1]) || str_p[strlen(tag_id_p) + 1] == '>' || strncmp("/>", str_p + strlen(tag_id_p) + 1, 2) == 0) && (end_p = strchr(str_p, '>')) != NULL) { *tag_pp = str_p - 1; *tag_len_p = end_p + 1 - (str_p - 1); return (TRUE); } else return (FALSE); } static int match_end_tag (const char *str_p, const char *tag_id_p, const char **tag_pp, int *tag_len_p) { if (strncmp(str_p, "') { *tag_pp = str_p; *tag_len_p = str_p + strlen(tag_id_p) + 3 - str_p; return (TRUE); } else return (FALSE); } static int find_attr (const char *attr_id_p, const char *tag_p, int tag_len, const char **attr_pp, int *attr_len_p) { const char *attr_p, *attr_end_p; char ch = tag_p[tag_len]; ((char*) tag_p)[tag_len] = '\0'; if ((attr_p = strstr(tag_p, attr_id_p)) != NULL && strncmp("=\"", attr_p += strlen(attr_id_p), 2) == 0 && (attr_end_p = strchr(attr_p += 2, '\"')) != NULL) { *attr_pp = attr_p; *attr_len_p = attr_end_p - attr_p; } else attr_p = NULL; ((char*) tag_p)[tag_len] = ch; return (attr_p != NULL); } /* find tag content */ static int find_tag_content (const char *str_p, const char *tag_id_p, const char **content_pp, int *content_len) { const char *tag_p; const char *tag_end_p; const char *next_p; int tag_len; int tag_end_len; while ((str_p = strchr(str_p, '<')) != NULL) { if (match_tag(str_p, "style", &tag_p, &tag_len)) { next_p = str_p; while ((next_p = strstr(next_p, " 0) new_str_p = malloc(len + 1), len = 0; while (*str_p != q) { char ch = *str_p++; if (ch == '\\') { ch = *str_p++; if (new_str_p != NULL) new_str_p[len] = ch; } else if (new_str_p != NULL) new_str_p[len] = ch; len++; } } while (new_str_p == NULL); *in_pp = str_p + 1; if(new_str_p) new_str_p[len] = '\0'; return (new_str_p); } /* get url string */ static const char *get_url (const char **in_pp) { const char *str_p = *in_pp; const char *start_p = str_p; const char *end_p = str_p; char *new_str_p; while (*str_p != ')') { while (!isspace(*str_p) && *str_p != ')') str_p++; end_p = str_p; skip_ws(&str_p); } *in_pp = end_p; new_str_p = malloc(end_p - start_p + 1); assert(new_str_p); memcpy(new_str_p, start_p, end_p - start_p); new_str_p[end_p - start_p] = '\0'; return (new_str_p); } static int find_prop(const char *prop_id_p, const char *tag_p, int tag_len, const char **prop_pp, int *prop_len_p) { const char *prop_p, *prop_end_p; char ch = tag_p[tag_len]; ((char*) tag_p)[tag_len] = '\0'; if ((prop_p = strstr(tag_p, prop_id_p)) != NULL && (strncmp("(", prop_p += strlen(prop_id_p), 1) == 0) && (prop_end_p = strchr(prop_p += 1, ')')) != NULL) { *prop_pp = prop_p; *prop_len_p = prop_end_p - prop_p; } else prop_p = NULL; ((char*) tag_p)[tag_len] = ch; return (prop_p != NULL); } /* scan CSS content for URLs */ static void scan_css (const char *style_p, const char *ref_point_p) { const char *p, *url_p; for (p = (const char*) style_p; p < ref_point_p && (*p != '<' && *p != '/' && *p != 's'); p++) { if ((p+2 <= ref_point_p) && *p == 'u' && *(p+1) == 'r' && *(p+2) == 'l') { p += 4; /* lenght of "url(" */ skip_ws(&p); if (*p == '\'' || *p == '\"') { url_p = get_qstring(&p); } else { url_p = get_url(&p); } skip_ws(&p); if (*p++ == ')') { if (url_p != NULL) add_url(url_p, (url_p + strlen(url_p) + 1), FALSE, ref_point_p - scan_file_p); } } } } /* scan css file for external references */ static void scan_css_file (void) { scan_css((const char*) scan_obj_p->file_p, (const char*) scan_obj_p->file_p + scan_obj_p->file_len); } /* scan JPEG content */ static void scan_jpeg_file (void) { unsigned char *data_p = scan_obj_p->file_p, type; unsigned int len = scan_obj_p->file_len; unsigned int offset = 0; int done = FALSE; do { if (data_p[offset] != 0xff) { fprintf(stderr, "scan_jpeg_file: %s - unexpected byte 0x%02x at offset %d -- looking for marker\n", scan_obj_p->url_p, data_p[offset], offset); ++offset; continue; } offset++; switch ((type = data_p[offset++])) { case 0xda: { unsigned int len = data_p[offset] << 8 | data_p[offset + 1]; offset += len; done = TRUE; break; } case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: case 0xc6: case 0xc7: case 0xc9: case 0xca: case 0xcb: case 0xcd: case 0xce: case 0xcf: /* app markers */ case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4: case 0xe5: case 0xe6: case 0xe7: case 0xe8: case 0xe9: case 0xea: case 0xeb: case 0xec: case 0xed: case 0xee: case 0xef: /* comment */ case 0xff: /* DAC */ case 0xcc: case 0xde: case 0xc4: case 0x01: case 0xd9: case 0xdb: case 0xfe: { unsigned int len = data_p[offset] << 8 | data_p[offset + 1]; offset += len; break; } case 0xf0: case 0xf1: case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf6: case 0xf7: case 0xf8: case 0xf9: case 0xfa: case 0xfb: case 0xfc: case 0xfd: case 0xc8: case 0xd8: break; case 0xdc: case 0xdd: offset += 4; break; case 0xdf: offset += 3; break; case 0x00: break; default: break; } } while (!done && offset < len); if (done) scan_obj_p->img_hdr_size = offset; else scan_obj_p->img_hdr_size = 1024; fprintf(stderr, "%s : image hdr size estimated %d bytes\n", scan_obj_p->url_p, scan_obj_p->img_hdr_size); } /* scan root xhtml for referenced data - * images/objects and external style sheets */ static void scan_xhtml (const char *str_p) { char *tag_p; char *next_p; char * p; int tag_len; int offset; /* we need to find tags and scan * within that area for @import followed by the URL */ for (next_p = (char*) str_p; find_tag_content(next_p, "style", &tag_p, &tag_len); next_p = tag_p + tag_len) { assert(tag_p[tag_len] == '<'); /* do a simplistic search for url */ tag_p[tag_len] = '\0'; scan_css(tag_p, tag_p + tag_len + 9); /* 9 is for " " */ tag_p[tag_len] = '<'; } while (str_p != NULL && *str_p != '\0') { if ((str_p = strchr(str_p, '<')) == NULL) break; /* check each start tag for a style property containing a url */ if ((*(str_p+1) != '/') && (*(str_p+1) != '?') && (*(str_p+1) != '!')){ /* this is a start tag, not an end tag or procesing instruction */ tag_p = (char *)(str_p+1); skip_ws(&tag_p); /* don't look in style elements, that's been done already */ if(strncmp(tag_p, "style", 5) != 0) { for (tag_len = 0,p = tag_p; *p != '>' ;p++) tag_len++; offset = tag_p + tag_len - scan_file_p; next_p = tag_p + tag_len; if(find_attr("style", tag_p, tag_len, &tag_p, &tag_len)) { if (find_prop("url", tag_p, tag_len, &tag_p, &tag_len)) { add_url(tag_p, tag_p + tag_len, FALSE, offset); } } } } if (match_tag(str_p, "img", &tag_p, &tag_len)) { offset = tag_p + tag_len - scan_file_p; next_p = tag_p + tag_len; if (find_attr("src", tag_p, tag_len, &tag_p, &tag_len)) add_url(tag_p, tag_p + tag_len, FALSE, offset); str_p = tag_p + tag_len; } else if (match_tag(str_p, "object", &tag_p, &tag_len)) { offset = tag_p + tag_len - scan_file_p; next_p = tag_p + tag_len; if (find_attr("data", tag_p, tag_len, &tag_p, &tag_len)) add_url(tag_p, tag_p + tag_len, FALSE, offset); str_p = tag_p + tag_len; } else if (match_tag(str_p, "link", &tag_p, &tag_len)) { offset = tag_p + tag_len - scan_file_p; next_p = tag_p + tag_len; if (find_attr("href", tag_p, tag_len, &tag_p, &tag_len)) add_url(tag_p, tag_p + tag_len, FALSE, offset); str_p = tag_p + tag_len; } else if (match_tag(str_p, "base", &tag_p, &tag_len)) { offset = tag_p + tag_len - scan_file_p; next_p = tag_p + tag_len; if (find_attr("href", tag_p, tag_len, &tag_p, &tag_len)) { char *url_p = malloc(tag_len+1); memcpy(url_p, tag_p, tag_len); url_p[tag_len] = '\0'; if(URL_abs_e == URL_check_url((unsigned char*)url_p)) { char *temp_p = malloc(tag_len + 1); int len = URL_extract_base((unsigned char*)url_p, (unsigned char*)temp_p, tag_len+1); if(len >= 0) { free(defaultBaseP); defaultBaseP = temp_p; } else { free(temp_p); } } } str_p = tag_p + tag_len; } else str_p = strchr(str_p, '>'); } } /* scan root xhtml for referenced data - * images/objects and external style sheets */ static void scan_xhtml_file (void) { scan_xhtml((const char*) scan_obj_p->file_p); } static int is_in_group (interleave_group_t *group_p, refobj_t *obj_p) { int i; for (i = 0; i < group_p->urls_cnt; i++) { if (strcmp(group_p->urls_p[i], obj_p->url_p) == 0) break; } return (i < group_p->urls_cnt); } static refobj_t *get_group_obj (interleave_group_t *group_p, int i) { refobj_t *obj_p; list_node_t *node_p; /* traverse list */ for ( node_p = refobjs.next_p; node_p != &refobjs; node_p = node_p->next_p) { obj_p = (refobj_t*)node_p->data; if (strcmp(obj_p->url_p, group_p->urls_p[i]) == 0) return (obj_p); } return (NULL); } static int query_interleave_size (refobj_t *obj_p, int *chunk_size_p) { interleave_group_t *group_p; list_node_t *node_p; /* find the object URL in an interleave group */ /* traverse list */ for (node_p = interleave_groups.next_p; node_p != &interleave_groups; node_p = node_p->next_p) { group_p = (interleave_group_t*)node_p->data; if (is_in_group(group_p, obj_p)) { int smallest_size = 0, i; /* determine chunk size based on size * relative to smallest object */ for (i = 0; i < group_p->urls_cnt; i++) { refobj_t *group_obj_p = get_group_obj(group_p, i); if (group_obj_p != NULL) { int size = get_obj_len(group_obj_p) - group_obj_p->img_hdr_size; if (size < smallest_size || i == 0) smallest_size = size; } } *chunk_size_p = (get_obj_len(obj_p) - obj_p->img_hdr_size) * ChunkSize / smallest_size; return (TRUE); } } return (FALSE); } static void image_get_chunk_seq (refobj_t *obj_p, chunk_seq_t *seq_p) { int chunk_size; int parent_offs = MIN(obj_p->parent_offs + 8 * 1024, get_obj_len(obj_p->parent_obj_p)); if (!query_interleave_size(obj_p, &chunk_size)) { seq_p->chunks_p = (chunk_node_t*) malloc(sizeof(chunk_node_t) * 2); assert(seq_p->chunks_p); seq_p->chunks_cnt = 2; seq_p->chunks_p[0].parent_offs = obj_p->parent_offs; seq_p->chunks_p[0].length = obj_p->img_hdr_size + obj_p->content_hdr_len; seq_p->chunks_p[1].parent_offs = parent_offs; seq_p->chunks_p[1].length = get_obj_len(obj_p) - seq_p->chunks_p[0].length; } else { int size = get_obj_len(obj_p) - obj_p->img_hdr_size - obj_p->content_hdr_len; int num_chunks = (size + chunk_size - 1) / chunk_size + 1; int i; seq_p->chunks_p = (chunk_node_t*) malloc(sizeof(chunk_node_t) * num_chunks); assert(seq_p->chunks_p); seq_p->chunks_cnt = num_chunks; seq_p->chunks_p[0].parent_offs = obj_p->parent_offs; seq_p->chunks_p[0].length = obj_p->img_hdr_size + obj_p->content_hdr_len; for (i = 1; i < num_chunks; i++) { seq_p->chunks_p[i].parent_offs = parent_offs; seq_p->chunks_p[i].length = MIN(chunk_size, size); size -= seq_p->chunks_p[i].length; } assert(size == 0); } } /* get parent offset for images - typically we * will want to output 512 bytes (headers) of the * image data very close to the reference. We'd want * to output the remainder approximate "one page" * later */ static int image_get_parent_offs (refobj_t *obj_p) { if (obj_p->output_offs < get_obj_len(obj_p)) { if (obj_p->output_offs < obj_p->img_hdr_size + obj_p->content_hdr_len) return (obj_p->parent_offs); else return (obj_p->parent_offs + 4 * 1024); } else return (-1); } static int image_get_chunk_size (refobj_t *obj_p) { if (obj_p->output_offs < obj_p->img_hdr_size + obj_p->content_hdr_len) return (MIN(obj_p->img_hdr_size + obj_p->content_hdr_len - obj_p->output_offs, get_obj_len(obj_p) - obj_p->output_offs)); else return (get_obj_len(obj_p) - obj_p->output_offs); } static refobj_t *find_first_child (refobj_t *obj_p, int offs, int *child_offs_p) { refobj_t *first_p = NULL; refobj_t *child_p; list_node_t *node_p; int first_offs = -1; int child_offs; /* traverse list */ for(node_p = obj_p->children.next_p; node_p != &obj_p->children; node_p = node_p->next_p) { child_p = (refobj_t *)node_p->data; if (child_p->output_offs == get_obj_len(child_p)) continue; if (child_p->ext_p->get_parent_offs != NULL) child_offs = (child_p->ext_p->get_parent_offs)(child_p); else child_offs = child_p->output_offs < get_obj_len(child_p) ? child_p->parent_offs : -1; if (child_offs >= 0 && (first_p == NULL || child_offs < first_offs)) first_p = child_p, first_offs = child_offs; } if (ForceBeforeRoot) first_offs = 0; *child_offs_p = first_offs > 0 ? obj_p->content_hdr_len + first_offs : 0; return (first_p); } /* output a chunk */ static void output_multiplex_chk (FILE *out_p, refobj_t *obj_p, int chunk_size) { if(obj_p->output_offs == 0) obj_p->msg = ++msg_number; fprintf(out_p, "CHK %d %d %s\r\n", (reuseMsgNumbers ? obj_p->msg : obj_p->id), chunk_size, obj_p->output_offs + chunk_size < get_obj_len(obj_p) ? "MORE" : "LAST"); if (obj_p->output_offs + chunk_size >= get_obj_len(obj_p)) --msg_number; if (obj_p->output_offs < obj_p->content_hdr_len) { int size = MIN(obj_p->content_hdr_len - obj_p->output_offs, chunk_size); fwrite(obj_p->content_hdr_p + obj_p->output_offs, 1, size, out_p); obj_p->output_offs += size; chunk_size -= size; } if (chunk_size > 0) { assert(chunk_size == MIN(chunk_size, get_obj_len(obj_p) - obj_p->output_offs)); fwrite(obj_p->file_p + obj_p->output_offs - obj_p->content_hdr_len, 1, chunk_size, out_p); obj_p->output_offs += chunk_size; } fprintf(out_p, "\r\n"); } static list_node_t chunk_seqs; static void prep_chunk_seqs (void) { list_node_t *node_p; refobj_t *obj_p; chunk_seq_t *seq_p; list_node_t *nu_seq; int i, size; /* foreach object create a chunk sequence */ list_init((&(chunk_seqs))); /* traverse list */ for(node_p = refobjs.next_p; node_p != &refobjs; node_p = node_p->next_p) { obj_p = (refobj_t*)node_p->data; seq_p = (chunk_seq_t*) malloc(sizeof(chunk_seq_t)); assert(seq_p); seq_p->obj_p = obj_p; seq_p->output_idx = 0; seq_p->output_offs = 0; if (obj_p->ext_p->get_chunk_seq != NULL) (obj_p->ext_p->get_chunk_seq)(obj_p, seq_p); else { seq_p->chunks_p = (chunk_node_t*) malloc(sizeof(chunk_node_t) * 1); assert(seq_p->chunks_p); seq_p->chunks_cnt = 1; seq_p->chunks_p[0].parent_offs = obj_p->parent_offs; seq_p->chunks_p[0].length = get_obj_len(obj_p); } for (i = 0, size = 0; i < seq_p->chunks_cnt; i++) size += seq_p->chunks_p[i].length; assert(size == get_obj_len(seq_p->obj_p)); nu_seq = (list_node_t *)malloc(sizeof(list_node_t)); assert(nu_seq); nu_seq->data = (void *)seq_p; list_insert_before(&chunk_seqs, nu_seq); } } static void destroy_chunk_seqs (void) { chunk_seq_t *seq_p; list_node_t *node_p; /* destroy chunk sequences */ while (chunk_seqs.prev_p != &chunk_seqs) { node_p = chunk_seqs.next_p; node_p = list_unlink(node_p); seq_p = (chunk_seq_t *)node_p->data; free(node_p); if(seq_p) { free(seq_p->chunks_p); free(seq_p); } } } static int find_next_child_offset (refobj_t *obj_p, int old_offs) { int new_offs = get_obj_len(obj_p); list_node_t *node_p; chunk_seq_t *seq_p; int parent_offs; for(node_p = chunk_seqs.next_p; node_p != &chunk_seqs; node_p = node_p->next_p) { seq_p = (chunk_seq_t *)node_p->data; if (seq_p->obj_p->parent_obj_p == obj_p) { if (seq_p->output_idx < seq_p->chunks_cnt) { parent_offs = seq_p->chunks_p[seq_p->output_idx].parent_offs; if (parent_offs >= old_offs && parent_offs < new_offs) new_offs = parent_offs; } } } return (new_offs); } static int find_child_chunk (refobj_t *parent_p, int offs, int offs_idx, chunk_seq_t **v_seq_p, chunk_node_t **v_chunk_p) { list_node_t *node_p; chunk_seq_t *seq_p; int i, cnt; for(node_p = chunk_seqs.next_p; node_p != &chunk_seqs; node_p = node_p->next_p) { seq_p = (chunk_seq_t *)node_p->data; if (seq_p->obj_p->parent_obj_p != parent_p) continue; /* set i */ for(i = 0; i < seq_p->chunks_cnt && seq_p->chunks_p[i].parent_offs < offs; i++) /* move to next */ ; for(cnt = offs_idx; (cnt > 0 && i < seq_p->chunks_cnt && seq_p->chunks_p[i].parent_offs == offs); i++, cnt--) /* move to next */; if (i >= seq_p->output_idx && i < seq_p->chunks_cnt && seq_p->chunks_p[i].parent_offs == offs) { *v_seq_p = seq_p; *v_chunk_p = &seq_p->chunks_p[i]; return (TRUE); } } return (FALSE); } static void output_multiplex_obj (FILE *out_p, refobj_t *obj_p, int length) { int offs; int offs_idx; int next_offs; int progress; int end_offs; chunk_seq_t *seq_p; chunk_node_t *chunk_p; offs = obj_p->output_offs; end_offs = offs + length; offs_idx = 0; do { progress = FALSE; next_offs = find_next_child_offset(obj_p, offs); if (offs < next_offs) { if (offs == end_offs) break; if (next_offs > end_offs) next_offs = end_offs; output_multiplex_chk(out_p, obj_p, next_offs - offs); progress = TRUE; offs = next_offs; offs_idx = 0; } else { /* attempt to find next chunk */ progress = find_child_chunk(obj_p, offs, offs_idx, &seq_p, &chunk_p); if (!progress) { offs_idx++; progress = find_child_chunk(obj_p, offs, offs_idx, &seq_p, &chunk_p); } if (progress) { output_multiplex_obj(out_p, seq_p->obj_p, chunk_p->length); seq_p->output_idx++; } } } while (progress); } /* output multiplexed */ static void output_multiplex (FILE *out_p) { /* output header */ fprintf(out_p, "Content-Type: %s; type=%s\r\n", TYPE_MULTIPLEXED, root_obj_p->ext_p->type_p); fprintf(out_p, "\r\n"); prep_chunk_seqs(); output_multiplex_obj (out_p, root_obj_p, get_obj_len(root_obj_p)); destroy_chunk_seqs(); fprintf(out_p, "CHK 0 0 LAST\r\n"); } /* match assignment option */ static const char *match_opt_assign (int *argc_p, char *argv_p[], const char *opt_p) { int argc = *argc_p, i; const char *val_p; for (i = 1; i < argc; i++) { if (strncmp(argv_p[i], opt_p, strlen(opt_p)) == 0) { val_p = argv_p[i] + strlen(opt_p); for (i++; i < argc; i++) argv_p[i-1] = argv_p[i]; *argc_p = argc - 1; return (val_p); } } return (NULL); } /* match assignment option */ static int find_opt (int *argc_p, char *argv_p[], const char *opt_p) { int argc = *argc_p, i; const char *val_p; for (i = 1; i < argc; i++) { if (strcmp(argv_p[i], opt_p) == 0) { val_p = argv_p[i] + strlen(opt_p); for (i++; i < argc; i++) argv_p[i-1] = argv_p[i]; *argc_p = argc - 1; return (TRUE); } } return (FALSE); } /* init */ static void init (void) { list_init(&(refobjs)); scan_file_p = NULL; scan_obj_p = NULL; root_obj_p = NULL; filename_base_p = NULL; obj_cnt = 0; msg_number = 0; } /* cleanup */ static void cleanup (void) { list_node_t *node_p; refobj_t *obj_p ; interleave_group_t *group_p; while (refobjs.next_p != &refobjs) { node_p = refobjs.next_p; list_unlink(node_p); obj_p = (refobj_t*)node_p->data; free(node_p); free(obj_p->file_p); free(obj_p->content_hdr_p); free(obj_p); } while(interleave_groups.next_p != &interleave_groups) { node_p = interleave_groups.next_p; list_unlink(node_p); group_p = (interleave_group_t *)node_p->data; free(node_p); free(group_p->urls_p[0]); free(group_p->urls_p); free(group_p); } if (defaultBaseP) free(defaultBaseP); } static void create_interleave_group (const char *urls_p) { interleave_group_t *group_p; list_node_t * node_p; int cnt; int len; const char *str_p, *next_p; char *buf_p; /* count the number of URLs and string length */ str_p = urls_p; cnt = 0; len = 0; do { next_p = strchr(str_p, ','); cnt++; if (next_p != NULL) len += (next_p - str_p) + 1; else len += strlen(str_p) + 1; if (next_p != NULL) str_p = next_p + 1; else str_p = NULL; } while (str_p != NULL); group_p = (interleave_group_t*) malloc(sizeof(interleave_group_t)); assert(group_p); group_p->urls_cnt = cnt; group_p->urls_p = (char**) malloc(sizeof(const char*) * cnt); assert(group_p->urls_p); buf_p = (char*) malloc(len); str_p = urls_p; cnt = 0; do { next_p = strchr(str_p, ','); if (next_p != NULL) len = (next_p - str_p) + 1; else len = strlen(str_p) + 1; group_p->urls_p[cnt] = buf_p; strncpy(group_p->urls_p[cnt], str_p, len - 1); group_p->urls_p[cnt][len - 1] = '\0'; buf_p += len; cnt++; if (next_p != NULL) str_p = next_p + 1; else str_p = NULL; } while (str_p != NULL); node_p = (list_node_t*)malloc(sizeof(list_node_t)); node_p->data = (void *)group_p; list_insert_before(&interleave_groups, node_p); } /* main */ int main (int argc, char *argv_p[]) { const char *base_opt_p; const char *output_prefix_p; int i, base_len; int to_stdout = TRUE; int base_on = TRUE; int content_disp_option = FALSE; const char *group_p; list_init((&(interleave_groups))); defaultBaseP = NULL; if((base_opt_p = match_opt_assign(&argc, argv_p, "-base=")) != NULL) { base_len = strlen(base_opt_p) + 1; if(URL_abs_e != URL_check_url((unsigned char*)base_opt_p)) { fprintf(stderr, "Failed to set base uri (%s)\n", base_opt_p); base_opt_p = NULL; } } if(find_opt(&argc, argv_p, "-encode")) encodeHeaders = TRUE; Content_Disposition = (char *)match_opt_assign(&argc, argv_p, "-content-disp="); if (find_opt(&argc, argv_p, "-cid")) IncludeContentID = 1; reuseMsgNumbers = find_opt(&argc, argv_p, "-reusemsg"); ForceBeforeRoot = find_opt(&argc, argv_p, "-force-before-root"); OneObj = find_opt(&argc, argv_p, "-one-obj"); output_prefix_p = match_opt_assign(&argc, argv_p, "-output-prefix="); /* extract interleave groups */ while ((group_p = match_opt_assign(&argc, argv_p, "-interleave=")) != NULL) { /* group should be a comma seperated list of objects */ create_interleave_group(group_p); } to_stdout = (argc == 2); if (argc > 1 && !strcmp(argv_p[argc - 1], "-")) --argc, to_stdout = TRUE; for (i = 1; i < argc; i++) { init(); add_url(argv_p[i], argv_p[i] + strlen(argv_p[i]), TRUE, 0); if(base_opt_p) { defaultBaseP = malloc(base_len); URL_extract_base((unsigned char*)base_opt_p, (unsigned char*)defaultBaseP, base_len); } if (root_obj_p) { FILE *out_p = stdout; scan_objs(); if (!to_stdout) { char output_name[512]; sprintf(output_name, "%s%s%s.%s", output_prefix_p ? output_prefix_p : "", (output_prefix_p && output_prefix_p[strlen(output_prefix_p) - 1] != '/') ? "/" : "", argv_p[i], "mx"); if ((out_p = fopen(output_name, "wb")) == NULL) fprintf(stderr, "Failed to open output file (%s)\n", output_name); } if (out_p) { #if defined (_WIN32) int result; result = _setmode(_fileno(stdout), _O_BINARY); if(result == -1) perror("Cannot set binary mode, using default text mode for stdout."); #endif output_multiplex(out_p); fflush(out_p); if (!to_stdout) fclose(out_p); } } else fprintf(stderr, "No root obj?! (%s)\n", argv_p[i]); if(defaultBaseP) { free(defaultBaseP); defaultBaseP = NULL; } cleanup(); } return (0); } struct EntityData { char * entity_name; int codept; }; struct EntityData EntityDataLookupTable[] = { "aacute", 225, // latin small letter a with acute, U+00E1 ISOlat1 "acirc", 226, // latin small letter a with circumflex, U+00E2 ISOlat1 "acute", 180, // acute accent = spacing acute, U+00B4 ISOdia "aelig", 230, // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 "agrave", 224, // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 "alefsym", 8501, // alef symbol = first transfinite cardinal, U+2135 NEW "alpha", 945, // greek small letter alpha, U+03B1 ISOgrk3 "amp", 38, // ampersand, U+0026 ISOnum "and", 8743, // logical and = wedge, U+2227 ISOtech "ang", 8736, // angle, U+2220 ISOamso "apos", 39, // apostrophe = APL quote, U+0027 ISOnum "aring", 229, // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 "asymp", 8776, // almost equal to = asymptotic "atilde", 227, // latin small letter a with tilde, U+00E3 ISOlat1 "auml", 228, // latin small letter a with diaeresis, U+00E4 ISOlat1 "bdquo", 8222, // double low-9 quotation mark, U+201E NEW "beta", 946, // greek small letter beta, U+03B2 ISOgrk3 "brvbar", 166, // broken bar = broken vertical bar, U+00A6 ISOnum "bull", 8226, // bullet = black small circle, U+2022 ISOpub "cap", 8745, // intersection = cap, U+2229 ISOtech "ccedil", 231, // latin small letter c with cedilla, U+00E7 ISOlat1 "cedil", 184, // cedilla = spacing cedilla, U+00B8 ISOdia "cent", 162, // cent sign, U+00A2 ISOnum "chi", 967, // greek small letter chi, U+03C7 ISOgrk3 "circ", 710, // modifier letter circumflex accent, U+02C6 ISOpub "clubs", 9827, // black club suit = shamrock, U+2663 ISOpub "cong", 8773, // approximately equal to, U+2245 ISOtech "copy", 169, // copyright sign, U+00A9 ISOnum "crarr", 8629, // downwards arrow with corner leftwards = carriage return, U+21B5 NEW "cup", 8746, // union = cup, U+222A ISOtech "curren", 164, // currency sign, U+00A4 ISOnum "dArr", 8659, // downwards double arrow, U+21D3 ISOamsa "dagger", 8224, // dagger, U+2020 ISOpub "darr", 8595, // downwards arrow, U+2193 ISOnum "deg", 176, // degree sign, U+00B0 ISOnum "delta", 948, // greek small letter delta, U+03B4 ISOgrk3 "diams", 9830, // black diamond suit, U+2666 ISOpub "divide", 247, // division sign, U+00F7 ISOnum "eacute", 233, // latin small letter e with acute, U+00E9 ISOlat1 "ecirc", 234, // latin small letter e with circumflex, U+00EA ISOlat1 "egrave", 232, // latin small letter e with grave, U+00E8 ISOlat1 "empty", 8709, // empty set = null set, U+2205 ISOamso "emsp", 8195, // em space, U+2003 ISOpub "ensp", 8194, // en space, U+2002 ISOpub "epsilon", 949, // greek small letter epsilon, U+03B5 ISOgrk3 "equiv", 8801, // identical to, U+2261 ISOtech "eta", 951, // greek small letter eta, U+03B7 ISOgrk3 "eth", 240, // latin small letter eth, U+00F0 ISOlat1 "euml", 235, // latin small letter e with diaeresis, U+00EB ISOlat1 "euro", 8364, // euro sign, U+20AC NEW "exist", 8707, // there exists, U+2203 ISOtech "fnof", 402, // latin small letter f with hook = "forall", 8704, // for all, U+2200 ISOtech "frac12", 189, // vulgar fraction one half = fraction one half, U+00BD ISOnum "frac14", 188, // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum "frac34", 190, // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum "frasl", 8260, // fraction slash, U+2044 NEW "gamma", 947, // greek small letter gamma, U+03B3 ISOgrk3 "ge", 8805, // greater-than or equal to, U+2265 ISOtech "gt", 62, // greater-than sign, U+003E ISOnum "hArr", 8660, // left right double arrow, U+21D4 ISOamsa "harr", 8596, // left right arrow, U+2194 ISOamsa "hearts", 9829, // black heart suit = valentine, U+2665 ISOpub "hellip", 8230, // horizontal ellipsis = three dot leader, U+2026 ISOpub "iacute", 237, // latin small letter i with acute, U+00ED ISOlat1 "icirc", 238, // latin small letter i with circumflex, U+00EE ISOlat1 "iexcl", 161, // inverted exclamation mark, U+00A1 ISOnum "igrave", 236, // latin small letter i with grave, U+00EC ISOlat1 "image", 8465, // black-letter capital I = imaginary part, U+2111 ISOamso "infin", 8734, // infinity, U+221E ISOtech "int", 8747, // integral, U+222B ISOtech "iota", 953, // greek small letter iota, U+03B9 ISOgrk3 "iquest", 191, // inverted question mark = turned question mark, U+00BF ISOnum "isin", 8712, // element of, U+2208 ISOtech "iuml", 239, // latin small letter i with diaeresis, U+00EF ISOlat1 "kappa", 954, // greek small letter kappa, U+03BA ISOgrk3 "lArr", 8656, // leftwards double arrow, U+21D0 ISOtech "lambda", 955, // greek small letter lamda, U+03BB ISOgrk3 "lang", 9001, // left-pointing angle bracket = "laquo", 171, // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum "larr", 8592, // leftwards arrow, U+2190 ISOnum "lceil", 8968, // left ceiling = APL upstile, U+2308 ISOamsc "ldquo", 8220, // left double quotation mark, U+201C ISOnum "le", 8804, // less-than or equal to, U+2264 ISOtech "lfloor", 8970, // left floor = APL downstile, U+230A ISOamsc "lowast", 8727, // asterisk operator, U+2217 ISOtech "loz", 9674, // lozenge, U+25CA ISOpub "lrm", 8206, // left-to-right mark, U+200E NEW RFC 2070 "lsaquo", 8249, // single left-pointing angle quotation mark, U+2039 ISO proposed "lsquo", 8216, // left single quotation mark, U+2018 ISOnum "lt", 38, // less-than sign, U+003C ISOnum "macr", 175, // macron = spacing macron = overline = APL overbar, U+00AF ISOdia "mdash", 8212, // em dash, U+2014 ISOpub "micro", 181, // micro sign, U+00B5 ISOnum "middot", 183, // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum "minus", 8722, // minus sign, U+2212 ISOtech "mu", 956, // greek small letter mu, U+03BC ISOgrk3 "nabla", 8711, // nabla = backward difference, U+2207 ISOtech "nbsp", 160, // no-break space = non-breaking space, U+00A0 ISOnum "ndash", 8211, // en dash, U+2013 ISOpub "ne", 8800, // not equal to, U+2260 ISOtech "ni", 8715, // contains as member, U+220B ISOtech "not", 172, // not sign = angled dash, U+00AC ISOnum "notin", 8713, // not an element of, U+2209 ISOtech "nsub", 8836, // not a subset of, U+2284 ISOamsn "ntilde", 241, // latin small letter n with tilde, U+00F1 ISOlat1 "nu", 957, // greek small letter nu, U+03BD ISOgrk3 "oacute", 243, // latin small letter o with acute, U+00F3 ISOlat1 "ocirc", 244, // latin small letter o with circumflex, U+00F4 ISOlat1 "oelig", 339, // latin small ligature oe, U+0153 ISOlat2 "ograve", 242, // latin small letter o with grave, U+00F2 ISOlat1 "oline", 8254, // overline = spacing overscore, U+203E NEW "omega", 969, // greek small letter omega, U+03C9 ISOgrk3 "omicron", 959, // greek small letter omicron, U+03BF NEW "oplus", 8853, // circled plus = direct sum, U+2295 ISOamsb "or", 8744, // logical or = vee, U+2228 ISOtech "ordf", 170, // feminine ordinal indicator, U+00AA ISOnum "ordm", 186, // masculine ordinal indicator, U+00BA ISOnum "oslash", 248, // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 "otilde", 245, // latin small letter o with tilde, U+00F5 ISOlat1 "otimes", 8855, // circled times = vector "ouml", 246, // latin small letter o with diaeresis, U+00F6 ISOlat1 "para", 182, // pilcrow sign = paragraph sign, U+00B6 ISOnum "part", 8706, // partial differential, U+2202 ISOtech "permil", 8240, // per mille sign, U+2030 ISOtech "perp", 8869, // up tack = orthogonal to = "phi", 966, // greek small letter phi, U+03C6 ISOgrk3 "pi", 960, // greek small letter pi, U+03C0 ISOgrk3 "piv", 982, // greek pi symbol, U+03D6 ISOgrk3 "plusmn", 177, // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum "pound", 163, // pound sign, U+00A3 ISOnum "prime", 8242, // prime = minutes = feet, U+2032 ISOtech "prod", 8719, // n-ary product = product sign, U+220F ISOamsb "prop", 8733, // proportional to, U+221D ISOtech "psi", 968, // greek small letter psi, U+03C8 ISOgrk3 "quot", 34, // quotation mark, U+0022 ISOnum "rArr", 8658, // rightwards double arrow, U+21D2 ISOtech "radic", 8730, // square root = radical sign, U+221A ISOtech "rang", 9002, // right-pointing angle bracket = "raquo", 187, // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum "rarr", 8594, // rightwards arrow, U+2192 ISOnum "rceil", 8969, // right ceiling, U+2309 ISOamsc "rdquo", 8221, // right double quotation mark, U+201D ISOnum "real", 8476, // black-letter capital R = real part symbol, U+211C ISOamso "reg", 174, // registered sign = registered trade mark sign, U+00AE ISOnum "rfloor", 8971, // right floor, U+230B ISOamsc "rho", 961, // greek small letter rho, U+03C1 ISOgrk3 "rlm", 8207, // right-to-left mark, U+200F NEW RFC 2070 "rsaquo", 8250, // single right-pointing angle quotation mark, U+203A ISO proposed "rsquo", 8217, // right single quotation mark, U+2019 ISOnum "sbquo", 8218, // single low-9 quotation mark, U+201A NEW "scaron", 353, // latin small letter s with caron, U+0161 ISOlat2 "sdot", 8901, // dot operator, U+22C5 ISOamsb "sect", 167, // section sign, U+00A7 ISOnum "shy", 173, // soft hyphen = discretionary hyphen, U+00AD ISOnum "sigma", 963, // greek small letter sigma, U+03C3 ISOgrk3 "sigmaf", 962, // greek small letter final sigma,U+03C2 ISOgrk3 "sim", 8764, // tilde operator = varies with = similar "spades", 9824, // black spade suit, U+2660 ISOpub "sub", 8834, // subset of, U+2282 ISOtech "sube", 8838, // subset of or equal to, U+2286 ISOtech "sum", 8721, // n-ary summation, U+2211 ISOamsb "sup1", 185, // superscript one = superscript digit one, U+00B9 ISOnum "sup2", 178, // superscript two = superscript digit two = squared, U+00B2 ISOnum "sup3", 179, // superscript three = superscript digit three = cubed, U+00B3 ISOnum "sup", 8835, // superset of, U+2283 ISOtech "supe", 8839, // superset of or equal to, U+2287 ISOtech "szlig", 223, // latin small letter sharp s = ess-zed, U+00DF ISOlat1 "tau", 964, // greek small letter tau, U+03C4 ISOgrk3 "there4", 8756, // therefore, U+2234 ISOtech "theta", 952, // greek small letter theta, U+03B8 ISOgrk3 "thetasym", 977, // greek theta symbol, U+03D1 NEW "thinsp", 8201, // thin space, U+2009 ISOpub "thorn", 254, // latin small letter thorn, U+00FE ISOlat1 "tilde", 732, // small tilde, U+02DC ISOdia "times", 215, // multiplication sign, U+00D7 ISOnum "trade", 8482, // trade mark sign, U+2122 ISOnum "uArr", 8657, // upwards double arrow, U+21D1 ISOamsa "uacute", 250, // latin small letter u with acute, U+00FA ISOlat1 "uarr", 8593, // upwards arrow, U+2191 "ucirc", 251, // latin small letter u with circumflex, U+00FB ISOlat1 "ugrave", 249, // latin small letter u with grave, U+00F9 ISOlat1 "uml", 168, // diaeresis = spacing diaeresis, U+00A8 ISOdia "upsih", 978, // greek upsilon with hook symbol,U+03D2 NEW "upsilon", 965, // greek small letter upsilon, U+03C5 ISOgrk3 "uuml", 252, // latin small letter u with diaeresis, U+00FC ISOlat1 "weierp", 8472, // script capital P = power set = Weierstrass p, U+2118 ISOamso "xi", 958, // greek small letter xi, U+03BE ISOgrk3 "yacute", 253, // latin small letter y with acute, U+00FD ISOlat1 "yen", 165, // yen sign = yuan sign, U+00A5 ISOnum "yuml", 255, // latin small letter y with diaeresis, U+00FF ISOlat1 "zeta", 950, // greek small letter zeta, U+03B6 ISOgrk3 "zwj", 8205, // zero width joiner, U+200D NEW RFC 2070 "zwnj", 8204 // zero width non-joiner, U+200C NEW RFC 2070 }; #define END(v) (v-1 + sizeof v / sizeof v[0]) int lookupCharRef(unsigned char *ref) /* compare chars starting at ref until the ';' char */ { struct EntityData *low = EntityDataLookupTable; struct EntityData *high = END(EntityDataLookupTable); struct EntityData *mid; int len; char *p; int comparison; for (p= (char *)ref, len=0; *p != ';'; p++) len++; // binary search in table while (low <= high) { mid = low + (high - low)/2; if((comparison = strncmp(mid->entity_name, (char *)ref, len)) == 0) return mid->codept; else if (comparison < 0) low = mid + 1; else high = mid - 1; } return -1; // unknown entity } /* convert a hex number to a 4bit nibble */ int hex2nib(const unsigned char c) { int nib; if('0' <= c && c <= '9') nib = c - '0'; else if('a' <= c && c <= 'f') nib = c - 'a' + 0xa; else if('A' <= c && c <= 'F') nib = c - 'A' + 0xa; else nib = -1; return (nib); } /* // convert a potentially escaped url to plain text // also converts number and character references to text. */ int URL_convert_to_plain (const unsigned char *escaped_p, unsigned char *plain_p, int max_size) { unsigned char *e = (unsigned char*)escaped_p; unsigned char *p = plain_p; int high, low; int url_len = 0; assert(escaped_p && plain_p && max_size); /* Now, unescape everything after the scheme. */ while(*e != '\0') { /* % HEX HEX */ if(*e == '%') { /* Get the high nibble of the byte. */ high = hex2nib(e[1]); if(high == -1) { url_len = 0; break; } /* Get the low nibble of the byte. */ low = hex2nib(e[2]); if(low == -1) { url_len = 0; break; } *p++ = (high << 4) | low; url_len++; e+= 3; } if (*e == '&') { /* either a numeric or character reference */ if (*(++e) == '#') { /* numeric reference, convert to code point */ if (*(++e) == 'x') { // hex digits /* Get the high nibble of the byte. */ high = hex2nib(*(++e)); if(high == -1) { url_len = 0; break; } /* Get the low nibble of the byte. */ low = hex2nib(*(++e)); if(low == -1) { url_len = 0; break; } *p++ = (high << 4) | low; } else { // decimal number low = atoi((char *)e); *p++ = low; } } else { /* lookup a character reference */ low = lookupCharRef(++e); if(low == -1) { url_len = 0; break; } *p++ = low; } url_len++; /* move to ending ';' */ while(*e && (*e != ';')) e++; /* move passed the ';' */ if( *e ) e++; } else { *p++ = *e++; url_len++; } /* Reached maximum capacity, so just stop here. */ if((p - plain_p) >= max_size) { url_len = max_size; break; } } plain_p[url_len] = '\0'; return (url_len); } typedef struct url_components_t { unsigned char *scheme_p; /* Scheme component */ unsigned char *auth_p; /* Authority component */ unsigned char *path_p; /* Path component */ unsigned char *query_p; /* Query component */ unsigned char *frag_p; /* Fragment */ unsigned char *end_p; /* End pointer */ unsigned char *last_slash_p; /* Last path segment */ unsigned char *opaque_p; /* Opaque part */ } URL_components_t; /* Valid url characters */ #define IS_URL_CHARS(ch) \ ((ch >= '*' && ch <= '~') || \ ch == '!' || ch == '#' || ch == '$' || ch == '%' || ch == '&') /* Look for ".." or "." as a complete segment. */ #define URL_COUNT_DOTS(ss_p,end_p) \ dot_count = 0; \ for(dot_p=ss_p; dot_p<=end_p; dot_p++) { \ if(*dot_p != '.' || ++dot_count > 2) { \ if(*dot_p != '/') \ dot_count = 0; \ break; \ }\ } /* decompse a url into its components */ URL_type_t assign_components(URL_components_t *comps, const unsigned char *url_p) { unsigned char *p = (unsigned char *)url_p; URL_type_t url_type = URL_any_e; assert(comps && url_p); /* Check for a scheme. */ if(isalpha(*p)) { while(*p != '\0') { if(*p == ':') { comps->scheme_p = p; p++; url_type == URL_abs_e; break; } if(!isalnum(*p) && *p != '+' && *p != '-' && *p != '.') { p = (unsigned char *)url_p; break; } p++; } } /* No scheme exists, reset pointer to beginning. */ if(!comps->scheme_p) p = (unsigned char *)url_p; /* Check for authority or path. */ if(p[0] == '/') { if(p[1] == '/') { comps->auth_p = p; p+=2; } else { comps->path_p = p++; } } else { /* If the absolute uri has an opaque part, put all bytes up to the first '/' begin path marker into the opaque part. */ if(comps->scheme_p && *p != '\0') { comps->opaque_p = p++; while(*p != '\0') { if(!IS_URL_CHARS(*p)) return (URL_illegal_e); if(*p == '/') break; p++; } } else if(*p != '?' && *p != '#' && *p != '\0') { if(!IS_URL_CHARS(*p)) return (URL_illegal_e); comps->path_p = p; } } /* Check for path/query/fragment. */ while(*p != '\0') { if(!IS_URL_CHARS(*p)) return (URL_illegal_e); switch(*p) { case '/': if(!comps->path_p && !comps->query_p && !comps->frag_p) comps->path_p = p; else comps->last_slash_p = p; break; case '?': if(!comps->query_p && !comps->frag_p) comps->query_p = p; break; case '#': if(!comps->frag_p) comps->frag_p = p; break; } p++; } comps->end_p = p; return (URL_any_e); } /* returns the type of the given url, based on its components */ URL_type_t check_type(URL_components_t *comps, const unsigned char *url_p) { URL_type_t url_type; assert(comps && url_p); if(*url_p == '\0') url_type = URL_any_e; else if(comps->scheme_p) url_type = URL_abs_e; else if(comps->frag_p && !comps->auth_p && !comps->path_p) url_type = URL_frag_e; else url_type = URL_rel_e; return (url_type); } /* * URL_check_url -- Check the type of the URL * * Examples of absolute URLs: * "http://example.com/some/text" * Examples of relative URLs: * "../images/photo-1.jpg" * ".././images/photo-1.jpg" */ URL_type_t URL_check_url (const unsigned char *url_p) { URL_components_t url_comps; URL_type_t url_type; assert(url_p); memset(&url_comps, 0, sizeof(url_comps)); url_type = assign_components(&url_comps, url_p); if(url_type != URL_illegal_e) url_type = check_type(&url_comps, (unsigned char *)url_p); return (url_type); } /* * resolve_relative_path_reference * * Purpose: Handle rules 6(c-h) of RFC 2396. * c) Remove occurrences of "./" where "." is a complete path segment. * d) Remove trailing complete "." segments. * e) Remove "/../" segments. * f) Remove trailing complete "/.." segments. * g) If buffer still begins with "..", then we will just accept it. * This is an error condition, but this is a viable option. * h) The remaining buffer is the new URI's path component. * * Params: start and end point to the beginning and ending path URI buffer * inclusively. * * Return: Number of bytes removed from the buffer. */ int resolve_relative_path_reference(unsigned char *start, unsigned char *end) { unsigned char *dot_p; /* Dot counter pointer. */ unsigned char *cur_p; /* Current pointer. */ unsigned char *ss_p; /* Start segment pointer. */ unsigned char *ps_p; /* Prev segment pointer. */ unsigned char *end_p; /* Running end pointer. */ int dot_count = 0; /* How many dot have been found. */ int other_found = 0; assert(start && end && (start <= end) && (*start == '/')); ps_p = ss_p = cur_p = start; end_p = end; cur_p++; while(cur_p <= end_p) { if (*cur_p == '/') { other_found = 0; /* Found start of a segment. */ if(dot_count == 1 || (dot_count == 2 && ps_p != ss_p)) { if(dot_count == 1) { /* Remove all occurences of "./", where "." is a complete path segment. */ memmove(ss_p, cur_p, (end_p - cur_p)+1); end_p -= 2; cur_p = ss_p; } else { URL_COUNT_DOTS(ps_p+1, ss_p); if(dot_count != 2) { /* Remove /../, where != "..". */ memmove(ps_p, cur_p, (end_p - cur_p)+1); end_p -= (cur_p - ps_p); ss_p = ps_p; cur_p = ss_p; /* Find the previous segment. */ while(ps_p > start) if(*--ps_p == '/') break; } else { ps_p = ss_p; ss_p = cur_p; } } cur_p++; } else { /* Set the previous segment pointer and continue. */ ps_p = ss_p; ss_p = cur_p++; } dot_count = 0; } else if(*cur_p == '.') { if(!other_found) dot_count++; cur_p++; } else { other_found = 1; dot_count = 0; cur_p++; } } URL_COUNT_DOTS(ss_p+1, end_p); if(dot_count == 1) { /* Remove "." if at end. */ end_p -= (end_p - ss_p); } else if(dot_count == 2 && ps_p) { URL_COUNT_DOTS(ps_p+1, ss_p); /* Remove /.. if a end and where != "..". */ if(dot_count != 2) end_p -= (end_p - ps_p); } /* Remove all segments of ".." at the beginning. */ ss_p = cur_p = start; cur_p++; while(cur_p <= end_p) { if (*cur_p == '/') { URL_COUNT_DOTS(ss_p+1, cur_p); if(dot_count == 2) { memmove(ss_p, cur_p, (end_p - cur_p)+1); cur_p = ss_p + 1; end_p -= 3; } else { break; } } else { cur_p++; } } /* Return the byte count that we removed. */ return (end - end_p); } /* * _add_component * Purpose: Add the buffer to the current buffer. If the current buffer * length is larger than the maximum buffer size, stop. * * Params: start and end point to the beginning and ending path URI buffer * inclusively. * current buffer pointer. * curlen is the current buffer length. * maxlen is the maximum length that the current buffer can be * * Return: Number of bytes copied. */ int add_component(const unsigned char *start, const unsigned char *end, unsigned char *current, int curlen, int maxlen) { int url_len; assert(start && end && (start <= end) && current); url_len = end - start; if((url_len + curlen) > maxlen) { url_len = maxlen - curlen; curlen = maxlen; } if(url_len) memcpy(current, start, url_len); return (url_len); } #define ADD_COMP(start_p,end_p) \ url_len = add_component((unsigned char*)start_p, (unsigned char*)end_p, up, total_len, max_size); \ total_len += url_len; \ up += url_len; /* * URL_combine_base_and_rel * * Combine base (base_p) and relative (relative_p) URLs into url_p. * Return: Length of the combined uri, 0 if no length -1 if error. */ int URL_combine_abs_and_rel (const unsigned char *base_p, const unsigned char *relative_p, unsigned char *url_p, int max_size) { URL_components_t base_comps; URL_components_t rel_comps; unsigned char *up = url_p; unsigned char *temp_p, *end_p; unsigned char slash_p[] = "/"; int url_len, total_len; URL_type_t url_type; memset(&base_comps, 0, sizeof(base_comps)); memset(&rel_comps, 0, sizeof(rel_comps)); total_len = url_len = 0; *up = '\0'; /* Parse URI base & reference into the potential four components and fragment identifier. */ if( (URL_illegal_e == assign_components(&base_comps, base_p)) || (URL_illegal_e == assign_components(&rel_comps, relative_p)) || (URL_illegal_e == (url_type = check_type(&rel_comps, (unsigned char*)relative_p)))) return (-1); /* Verify that we can combine the two urls. */ if((url_type == URL_abs_e) || (URL_abs_e != check_type(&base_comps, (unsigned char *)base_p))) { ADD_COMP(relative_p, rel_comps.end_p); *up = '\0'; return (url_len); } /* Is a reference to the current document? This will return the fragment identifier or a zero length uri. */ if( !rel_comps.scheme_p && !rel_comps.path_p && !rel_comps.auth_p && !rel_comps.opaque_p && !rel_comps.query_p) { ADD_COMP(relative_p, rel_comps.end_p); *up = '\0'; return (url_len); } ADD_COMP(base_p, base_comps.scheme_p+1); /* If the authority component is defined. */ if(!rel_comps.auth_p) { if((temp_p = (base_comps.auth_p ? base_comps.auth_p : (base_comps.opaque_p ? base_comps.opaque_p : NULL)))) { end_p = (base_comps.path_p ? base_comps.path_p : (base_comps.query_p ? base_comps.query_p : (base_comps.frag_p ? base_comps.frag_p : base_comps.end_p))); ADD_COMP(temp_p, end_p); } /* If the path is a network-path or absolute-path then heirarchical. */ if(rel_comps.path_p && *rel_comps.path_p != '/') { /* The relative path needs to be merged with the base URI's path. */ temp_p = up; /* Add the path delimiter if it dne add it. */ if(!((base_comps.path_p && *base_comps.path_p == '/') || *rel_comps.path_p == '/')) { url_len = add_component((unsigned char*)slash_p, (unsigned char*)(slash_p+1), up, total_len, max_size); total_len += url_len; up += url_len; } /* All but the last segment is copied. So, include the last slash, but no more. */ if(base_comps.path_p) { end_p = (base_comps.last_slash_p ? (base_comps.last_slash_p + 1) : (base_comps.query_p ? base_comps.query_p : (base_comps.frag_p ? base_comps.frag_p : base_comps.end_p))); ADD_COMP(base_comps.path_p, end_p); } /* The reference path component is appended. */ end_p = (rel_comps.query_p ? rel_comps.query_p : (rel_comps.frag_p ? rel_comps.frag_p : rel_comps.end_p)); ADD_COMP(rel_comps.path_p, end_p); url_len = resolve_relative_path_reference(temp_p, up-1); total_len-= url_len; up-= url_len; /* The remaining buffer is the new URI's path component. */ } } /* Add the authority component if it exists. */ if(rel_comps.auth_p) { end_p = (rel_comps.path_p ? rel_comps.path_p : (rel_comps.query_p ? rel_comps.query_p : (rel_comps.frag_p ? rel_comps.frag_p : rel_comps.end_p))); ADD_COMP(rel_comps.auth_p, end_p); } /* Add the relative path if it is an absolute path. */ if(rel_comps.path_p && *rel_comps.path_p == '/') { end_p = (rel_comps.query_p ? rel_comps.query_p : (rel_comps.frag_p ? rel_comps.frag_p : rel_comps.end_p)); ADD_COMP(rel_comps.path_p, end_p); } /* Add the relative query path including any fragment. Add the base path is the relative path dne. Add the path delimiter if it dne. */ if(rel_comps.query_p) { if(!rel_comps.path_p && base_comps.path_p) { if(*base_comps.path_p != '/') { ADD_COMP(slash_p, (slash_p+1)); } end_p = (base_comps.last_slash_p ? base_comps.last_slash_p + 1: (base_comps.query_p ? base_comps.query_p : (base_comps.frag_p ? base_comps.frag_p : base_comps.end_p))); ADD_COMP(base_comps.path_p, end_p); } end_p = rel_comps.frag_p ? rel_comps.frag_p : rel_comps.end_p; ADD_COMP(rel_comps.query_p, end_p); } /* Add the relative fragment path including any fragment. */ if(rel_comps.frag_p) { ADD_COMP(rel_comps.frag_p, rel_comps.end_p); } *up = '\0'; return (total_len); } /* * URL_extract_base -- Extract base URL from url_p (plain) * Extract base URL from url_p (must be escaped) into base_p (buffer max-size * bytes). It is only safe for the buffers to overlap at the beginning * (url_p == base_p). * * Return: Length of the extracted base uri, zero if no lenght of error. */ int URL_extract_base (const unsigned char *url_p, unsigned char *base_p, int max_size) { unsigned char *end_p; URL_components_t base_comps; URL_type_t url_type = URL_any_e; int url_len = 0; assert(url_p && base_p && max_size); memset(&base_comps, 0, sizeof(base_comps)); url_type = assign_components(&base_comps, url_p); if(url_type != URL_illegal_e) url_type = check_type(&base_comps, (unsigned char *)url_p); if(url_type != URL_abs_e) return (0); end_p = (base_comps.query_p ? base_comps.query_p : (base_comps.frag_p ? base_comps.frag_p : base_comps.end_p)); url_len = end_p - url_p; if(url_len > max_size) url_len = max_size; memcpy(base_p, url_p, url_len); base_p[url_len] = '\0'; return (url_len); }