/*-
 * Copyright (c) 2009 Thomas Hurst <tom@hur.st>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "html.h"

/*
 * Allocate an output buffer large enough to hold the escaped form of `len`
 * input bytes: MAX_ENTITY_SIZE bytes per input byte plus one for the
 * terminating NUL.
 *
 * Returns a malloc()ed buffer that the caller must free(), or NULL if the
 * allocation fails or the requested size would not fit in a size_t.
 *
 * NOTE(review): MAX_ENTITY_SIZE comes from html.h and is assumed to be a
 * positive constant no smaller than the longest sequence the escaper writes
 * for one input byte -- confirm against the header.
 */
char *
html_entities_ascii_buffer(size_t len)
{
	// Refuse sizes where len * MAX_ENTITY_SIZE + 1 would wrap around and
	// silently yield a buffer that is far too small.
	if (len > (SIZE_MAX - 1) / MAX_ENTITY_SIZE)
		return NULL;

	return malloc((len * MAX_ENTITY_SIZE) + 1);
}

/*
 * Report whether byte `c` has to be written as a numeric character
 * reference.
 *
 * The set is the control ranges 0x00-0x08, 0x0B-0x0C, 0x0E-0x1F, 0x7F-0x84
 * and 0x86-0x9F.  Tab, LF, CR, 0x85 and everything from 0xA0 upwards are
 * not in it.
 */
static int
html_needs_numeric_escape(unsigned char c)
{
	return c <= 0x08 || c == 0x0b || c == 0x0c ||
	    (c >= 0x0e && c <= 0x1f) ||
	    (c >= 0x7f && c <= 0x84) ||
	    (c >= 0x86 && c <= 0x9f);
}

/*
 * Escape `len` bytes of `buf` for embedding in HTML and store the result,
 * NUL-terminated, in `escaped`.
 *
 * '&', '<' and '"' become "&amp;", "&lt;" and "&quot;".  Control bytes (see
 * html_needs_numeric_escape) become "&#xNN;" with two upper-case hex digits.
 * Every other byte, '>' included, is copied unchanged.
 *
 * Exactly `len` bytes are read from `buf`; it does not have to be
 * NUL-terminated.  One input byte expands to at most 6 output bytes, so
 * `escaped` must have room for len * 6 + 1 bytes.
 *
 * Returns `escaped`, or NULL when `escaped` is NULL (e.g. a failed
 * allocation passed straight through).  A NULL `buf` is treated as empty
 * input.
 */
char *
html_entities_ascii(char *buf, size_t len, char *escaped)
{
	static const char hex[] = "0123456789ABCDEF";
	const unsigned char *in = (const unsigned char *)buf;
	size_t i = 0;
	size_t j = 0;

	if (escaped == NULL)
		return NULL;
	if (buf == NULL)
		len = 0;

	while (i < len)
	{
		// Fast path: copy four bytes at a time while all of them are
		// plain characters in 0x40..0x7E ('@', letters and friends).
		// This pays off on long alphabetic runs; Message-IDs usually
		// contain digits, with the occasional static string like
		// "powerpost", which reduces to "powe" "rpos" "t".
		//
		// The word is moved with memcpy() rather than through a cast
		// pointer so that unaligned input is fine on every platform.
		// The tests treat all four bytes alike, so byte order does not
		// matter.  (Skipping partial runs of 1-3 good bytes was tried
		// and turned out to be slower.)
		while (len - i >= sizeof(uint32_t))
		{
			uint32_t word;

			memcpy(&word, in + i, sizeof word);

			// Top two bits of every byte must be 01: 0x40..0x7F.
			if ((word & 0xc0c0c0c0u) != 0x40404040u)
				break;

			// 0x7F (DEL) matches the mask above but needs escaping.
			// With every byte <= 0x7F, adding one to each byte
			// cannot carry, and sets a byte's top bit only if that
			// byte was 0x7F.
			if (((word + 0x01010101u) & 0x80808080u) != 0)
				break;

			memcpy(escaped + j, &word, sizeof word);
			i += sizeof word;
			j += sizeof word;
		}

		// The fast path may have consumed the rest of the input; do not
		// read (or escape) the byte one past the end.
		if (i >= len)
			break;

		unsigned char c = in[i++];

		switch (c)
		{
			case '&':
				memcpy(escaped + j, "&amp;", 5);
				j += 5;
				break;
			case '<':
				memcpy(escaped + j, "&lt;", 4);
				j += 4;
				break;
			case '"':
				memcpy(escaped + j, "&quot;", 6);
				j += 6;
				break;
			default:
				if (html_needs_numeric_escape(c))
				{
					// Written out by hand: sprintf() was measurably
					// slower here.
					escaped[j++] = '&';
					escaped[j++] = '#';
					escaped[j++] = 'x';
					escaped[j++] = hex[c >> 4];
					escaped[j++] = hex[c & 0x0f];
					escaped[j++] = ';';
				}
				else
				{
					escaped[j++] = (char)c;
				}
				break;
		}
	}

	escaped[j] = '\0';
	return escaped;
}

#ifdef BUILD_TEST
/*
 * Test and benchmark driver, compiled only when BUILD_TEST is defined.
 *
 * Escapes one sample string, prints the input and the result, then repeats
 * the escape a million times as a crude timing loop.  Returns 0 on success
 * and 1 if the output buffer cannot be allocated.
 */
int
main(void)
{
	// Alternative inputs for manual testing:
	//char *buf = "foof$&b\"ar\"\001@moo<moo>";
	char *buf = "part1of201.i5WqnoDaVEHByaHWA&QmE@powerpost2000AA.local";
	//char *buf = "\001&\"\xff\xf3<>";
	size_t len = strlen(buf);
	char *e = html_entities_ascii_buffer(len);

	if (e == NULL)
	{
		fprintf(stderr, "out of memory\n");
		return 1;
	}

	// len is a size_t, which must be printed with %zu.
	printf("in: %s (%zu bytes)\n", buf, len);
	e = html_entities_ascii(buf, len, e);
	printf("out: %s\n", e);

#if 1
	// Timing loop; flip the #if to 0 to run the single conversion only.
	for (int i = 0; i < 1000000; i++)
	{
		html_entities_ascii(buf, len, e);
	}
#endif

	free(e);
	return 0;
}
#endif

