/* UNHTML -- Remove HTML markup from files.
   Written 2/96
   Copyright 1996, Tom Almy. All rights reserved.
   May be copied freely for non-commercial use.
   May not be sold or incorporated into commercial products.
   Contact the author for details:
   tom.almy%bbbbbs@comm-dat.com, tom.almy@f290.n105.z1.fidonet.org
   */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/* Filter state shared by every routine in this file. */

/* Most recent input character, as stored by mygetchar(). */
int ch;

/* 1 while nothing has been written since the last forced line break. */
int startline = 1;

/* Command line options: '2', 'd', 'D', 'w' and 'm' respectively. */
int twolineflag = 0;
int dosflag = 0;
int dos2flag = 0;
int winflag = 0;
int markflag = 0;

/* When set, blanks in the text are dropped until a non-blank is seen. */
int skipws = 0;

/* Set between <pre> and </pre>; newlines are then passed through as-is. */
int quoting = 0;

/* Number of spaces written after each forced line break. */
int indent = 0;

/* One row of the entity substitution table.
   The two single-character outputs hold 8-bit codes in the range
   128..255, so they are unsigned char: with plain char the initializers
   would not fit on platforms where char is signed. */
typedef struct {
	char in[7];				/* entity name, without '&' and ';' (max 6 chars) */
	unsigned char out1d;	/* DOS character (USA codepage) */
	unsigned char out1w;	/* Windows character */
	char out2[4];   /* ASCII substitute */
	char use2;		/* 0- (default) no special handling
                       1- use out2 instead of out1d for dos2flag
                       2- diacritical marked character
                       3- diacritical marked character with no DOS equivalent*/
} SUBS;

/* Entity substitution table, searched linearly by name.
   Columns: entity name, DOS (USA codepage 437) character, Windows
   character, ASCII substitute, use2 handling code (0 when omitted).
   The table ends with an entry whose name is the empty string.
   Entries 0 and 1 are referenced by index for the <q> tag, so they
   must stay first. */
SUBS a[] = {
	{"odq", '"', 147, "``", 1},	/* don't move these first two entries */
	{"cdq", '"', 148, "''", 1},
	{"OElig", 'E', 140, "OE", 1},
	{"oelig", 'e', 156, "oe", 1},
	{"amp", '&', '&', "&"},
	{"gt", '>', '>', ">"},
	{"lt", '<', '<', "<"},
	{"quot", '"', '"', "\""},
	{"mdash", '-', 151, "--", 1},
	{"osq", '`', 145, "`"},
	{"csq", '\'', 146, "'"},
	{"nbsp", ' ', 160, " "},
	{"#160", ' ', 160, " "},
	{"iexcl", 173, 161, "!"},
	{"#161", 173, 161, "!"},
	{"cent", 155, 162, "cnt"},
	{"#162", 155, 162, "cnt"},
	{"pound", 156, 163, "#"},			/* Octothorpe stands in for the British Pound sign */
	{"#163", 156, 163, "#"},			/* Octothorpe stands in for the British Pound sign */
	{"curren", '$', 164, "$"},
	{"#164", '$', 164, "$"},
	{"yen", 157, 165, "Y"},
	{"#165", 157, 165, "Y"},
	{"brvbar", '|', 166, "|"},
	{"#166", '|', 166, "|"},
	{"sect", 21, 167, "%"},
	{"#167", 21, 167, "%"},
	{"uml", '"', 168, "\""},
	{"#168", '"', 168, "\""},
	{"cright", 'C', 169, "(C)",1},
	{"copy", 'C', 169, "(C)",1},		/* standard name for the same character */
	{"#169", 'C', 169, "(C)",1},
	{"ordf", 166, 170, "a"},
	{"#170", 166, 170, "a"},
	{"laquo", 174, 171, "<<"},
	{"#171", 174, 171, "<<"},
	{"not", 170, 172, "~"},				/* 170 is the not sign; 169 is the reversed one */
	{"#172", 170, 172, "~"},
	{"shy", '-', 173, "-"},
	{"#173", '-', 173, "-"},
	{"reg", 'R', 174, "(R)",1},
	{"#174", 'R', 174, "(R)",1},
	{"macr", '-', 175, "-"},
	{"#175", '-', 175, "-"},
	{"deg", 248 , 176, "o"},
	{"#176", 248 , 176, "o"},
	{"plusmn", 241, 177, "+/-"},
	{"#177", 241, 177, "+/-"},
	{"sup2", 253, 178, "2"},
	{"#178", 253, 178, "2"},
	{"sup3", '3', 179, "3"},
	{"#179", '3', 179, "3"},
	{"acute", '\'', 180, "'"},
	{"#180", '\'', 180, "'"},
	{"micro", 230, 181, "u"},			/* was a backtick, which is not a micro sign */
	{"#181", 230, 181, "u"},
	{"pilcro", 20, 182, "P"},
	{"para", 20, 182, "P"},				/* standard name for the same character */
	{"#182", 20, 182, "P"},
	{"middot", 249, 183, "."},
	{"#183", 249, 183, "."},
	{"cedil", ',', 184, ","},
	{"#184", ',', 184, ","},
	{"sup1", '1', 185, "1"},
	{"#185", '1', 185, "1"},
	{"ordm", 167, 186, "o"},
	{"#186", 167, 186, "o"},
	{"raquo", 175, 187, ">>"},
	{"#187", 175, 187, ">>"},
	{"frac14", 172, 188, "1/4"},
	{"#188", 172, 188, "1/4"},
	{"frac12", 171, 189, "1/2"},
	{"#189", 171, 189, "1/2"},
	{"frac34", 254, 190, "3/4",1},
	{"#190", 254, 190, "3/4",1},
	{"iquest", 168, 191, "?"},
	{"#191", 168, 191, "?"},
	{"mult", 'x', 215, "*"},
	{"times", 'x', 215, "*"},			/* standard name for the same character */
	{"#215", 'x', 215, "*"},
	{"div", 246, 247, "/"},
	{"divide", 246, 247, "/"},			/* standard name for the same character */
	{"#247", 246, 247, "/"},
	{"Agrave", 'A', 192, "`A", 3},
	{"agrave", 133, 224, "`a", 2},
	{"Aacute", 'A', 193, "'A", 3},
	{"aacute", 160, 225, "'a", 2},
	{"Acirc", 'A', 194, "^A", 3},
	{"acirc", 131, 226, "^a", 2},
	{"Atilde", 'A', 195 , "~A", 3},
	{"atilde", 'a', 227, "~a", 3},
	{"Auml", 142, 196, ":A", 2},
	{"auml", 132, 228, ":a", 2},
	{"Aring", 143, 197, "A"},
	{"aring", 134, 229, "a"},
	{"AElig", 0222, 198, "AE"},
	{"aelig", 0221, 230, "ae"},
	{"Ccedil", 128, 199, ",C", 2},
	{"ccedil", 135, 231, ",c", 2},
	{"Egrave", 'E', 200, "`E", 3},
	{"egrave", 138, 232, "`e", 2},
	{"Eacute", 144, 201, "'E", 2},
	{"eacute", 130, 233, "'e", 2},
	{"Ecirc", 'E', 202, "^E", 3},
	{"ecirc", 136, 234, "^e", 2},
	{"Euml", 'E', 203, ":E", 3},
	{"euml", 137, 235, ":e", 2},
	{"Igrave", 'I', 204, "`I", 3},
	{"igrave", 141, 236, "`i", 2},
	{"Iacute", 'I', 205, "'I", 3},
	{"iacute", 161, 237, "'i", 2},
	{"Icirc", 'I', 206, "^I", 3},
	{"icirc", 140, 238, "^i", 2},
	{"Iuml", 'I', 207, ":I", 3},
	{"iuml", 139, 239, ":i", 2},
	{"ETH", 'D', 208, "D"},
	{"eth", 'o', 240, "o"},
	{"Ntilde", 165, 209, "~N", 2},
	{"ntilde", 164, 241, "~n", 2},
	{"Ograve", 'O', 210, "`O", 3},
	{"ograve", 149, 242, "`o", 2},
	{"Oacute", 'O', 211, "'O", 3},
	{"oacute", 162, 243, "'o", 2},
	{"Ocirc", 'O', 212, "^O", 3},
	{"ocirc", 147, 244, "^o", 2},
	{"Otilde", 'O', 213, "~O", 3},
	{"otilde", 'o', 245, "~o", 3},
	{"Ouml", 153, 214, ":O", 2},		/* codepage 437 does have this character */
	{"ouml", 148, 246, ":o", 2},
	{"Oslash", 'O', 216, "/O", 3},
	{"oslash", 'o', 248, "/o", 3},
	{"Ugrave", 'U', 217, "`U", 3},
	{"ugrave", 151, 249, "`u", 2},
	{"Uacute",'U' , 218, "'U", 3},
	{"uacute", 163, 250, "'u", 2},
	{"Ucirc", 'U', 219, "^U", 3},
	{"ucirc", 150, 251, "^u", 2},
	{"Uuml", 154, 220, ":U", 2},		/* 154 is the capital letter ... */
	{"uuml", 129, 252, ":u", 2},		/* ... and 129 the small one */
	{"Yacute",'Y' , 221, "'Y", 3},
	{"yacute", 'y', 253, "'y", 3},
	{"THORN", 'P', 222, "P"},
	{"thorn", 'p', 254, "p"},
	{"szlig", 225, 223, "B"},			/* codepage 437 has the sharp s at 225 */
	{"yuml", 152, 255, ":y", 2},
	{{0},0,0,{0}}						/* end marker: empty name */
};



/* Force the start of a new line.
   Writes a newline followed by `indent` spaces, then marks the output
   position as start-of-line.  The loop is bounded by `indent` itself so
   that a zero or negative value (an unmatched closing tag can drive it
   below zero) simply writes no padding instead of counting down without
   end. */
void newline(void) {
	int i;

	putchar('\n');
	for (i = 0; i < indent; i++) putchar(' ');
	startline=1;
}

/* Force the start of a new paragraph.
   Writes one newline, or two when twolineflag is set, followed by
   `indent` spaces, then marks the output position as start-of-line.
   The padding loop is bounded by `indent` so that a zero or negative
   value writes nothing rather than counting down without end. */
void newpara(void) {
	int i;

	putchar('\n');
	if (twolineflag) putchar('\n');
	/* Indenting won't really work well, but nothing we can do about it */
	for (i = 0; i < indent; i++) putchar(' ');
	startline=1;
}

/* Conditional line break: does nothing when the output is already at
   the start of a line, otherwise behaves like newline(). */
void cnewline(void) {
	if (startline != 0) {
		return;
	}
	newline();
}

/* Conditional paragraph break: does nothing when the output is already
   at the start of a line, otherwise behaves like newpara(). */
void cnewpara(void) {
	if (startline != 0) {
		return;
	}
	newpara();
}

/* Read the next input character into the global `ch`.
   Outside of preformatted text a newline is stored as a blank.
   At end of input the current output line is finished (if one is open)
   and the program exits with status 0, so callers never see EOF. */
void mygetchar(void) {
	int next = getchar();

	if (!quoting && next == '\n') {
		next = ' ';		/* convert to whitespace */
	}
	ch = next;

	if (next == EOF) {
		cnewline();
		exit(0);
	}
}

/* Write the replacement for table entry i to stdout.
   Which column of the entry is used depends on the option flags and on
   the entry's use2 code:
     - the DOS character for the 'd' option (unless 'm' is set and the
       entry is use2 == 3), and for the 'D' option when use2 is 0 or 2,
       or 3 without 'm';
     - with no option flags at all, the second character of the ASCII
       substitute for entries with use2 > 1 (the unmarked base letter);
     - the Windows character for the 'w' option;
     - the full ASCII substitute string in every other case. */
void putTableChar(int i) {
	const int kind = a[i].use2;
	int wantdos = 0;

	if (dosflag && !(markflag && kind == 3)) {
		wantdos = 1;
	}
	else if (dos2flag) {
		if (kind == 0 || kind == 2) wantdos = 1;
		else if (kind == 3 && !markflag) wantdos = 1;
	}

	if (wantdos) {
		putchar(a[i].out1d);
		return;
	}
	if (kind > 1 && !(markflag || dosflag || winflag || dos2flag)) {
		putchar(a[i].out2[1]);	/* diacritical character*/
		return;
	}
	if (winflag) {
		putchar(a[i].out1w);
		return;
	}
	fputs(a[i].out2, stdout);
}

/* Print the command line summary on stderr and exit with status 1.
   Does not return. */
void usage(void) {
	static const char helptext[] =
		"Usage: unhtml [2][d | D | w][m] <infile >outfile\n"
		"Program acts as a filter. Outputs one line per paragraph,\n"
		"suitable for import into word processors or author's JUSTIFY\n"
		"program to create text files\n"
		"Options:\n"
		" 2 - output blank line after each paragraph\n"
		" d - IBM-PC 8 bit character set (USA codepage)\n"
		" D - same, but uses multiple character sequences if necessary\n"
		" w - Microsoft Windows character set (ISO)\n"
		" m - multi-character diacritical marked characters if necessary\n"
		"     use with D, w, or default flags\n"
		" (default-- 7 bit ASCII, possible multicharacter sequences)\n\n";

	fputs(helptext, stderr);
	exit(1);
}


/* Sizes used by main() and its helpers. */
enum {
	CMDBUF_SIZE = 20,		/* tag or entity name plus terminating NUL */
	MAX_LIST_DEPTH = 10,	/* list levels that keep their own counter */
	INDENT_STEP = 4			/* columns added per blockquote/list level */
};

/* Write `indent` spaces; writes nothing when indent is zero or negative. */
static void put_indent(void) {
	int i;

	for (i = 0; i < indent; i++) putchar(' ');
}

/* Enter one list level.  `marker` becomes the level's counter value:
   1 for numbered items, 0 for bullets, -1 for unmarked items.  Levels
   nested deeper than MAX_LIST_DEPTH are still counted, so that the
   closing tags balance, but they get no counter slot. */
static void open_list(int *level, int counts[], int marker) {
	skipws = 1;
	if (*level == -1) cnewline(); /* Allow list heading */
	(*level)++;
	if (*level < MAX_LIST_DEPTH) counts[*level] = marker;
	if (*level > 0) indent += INDENT_STEP;
}

/* Leave one list level; leaving the outermost one ends the paragraph. */
static void close_list(int *level) {
	skipws = 1;
	if (*level > 0) indent -= INDENT_STEP;
	if (*level >= 0) (*level)--;
	if (*level == -1) newpara();
}

/* Program entry point: copies stdin to stdout with HTML markup removed.
   argv[1], when present, is a string of option letters (see usage()).
   Text is copied through, &name; entities are replaced via the table
   `a`, and a fixed set of tags is turned into line breaks, indentation
   and list markers; all other tags are dropped.
   The main loop never finishes on its own: mygetchar() exits the
   program with status 0 at end of input, and usage() exits with
   status 1 on a bad command line. */
int main(int argc, char **argv) {
	int notflag=0, intitle=0;
	char cmdbuf[CMDBUF_SIZE];
	int listlevel = -1; /* not in a list */
	int listcount[MAX_LIST_DEPTH];	/* current counter value at each list level */
	int i;
	char *arglist;

	fprintf(stderr, "HTML removing filter Version 1.0\n"
			"Copyright 1996 by Tom Almy\n");

	if (argc > 2) usage();

	if (argc==2){
		arglist = argv[1];
		while (*arglist != 0) {
			switch (*arglist++) {
				case '2': twolineflag = 1; break;
				case 'd': dosflag = 1; break;
				case 'D': dos2flag = 1; break;
				case 'w': winflag = 1; break;
				case 'm': markflag = 1; break;
				default: usage();
			}
		}
	}

	for (;;) {
		mygetchar();
		if (ch != '<' && ch !='&') {
			/* ordinary text */
			if (ch == ' ' && skipws) continue;
			skipws = 0;
			startline=0;
			if (!intitle) putchar(ch);
			/* a newline reaches this point only inside <pre> */
			if (ch=='\n' && !intitle && quoting) put_indent();
			continue;
		}
		if (ch == '&') {
			skipws = 0;
			/* special character processing */
			mygetchar();
			i=0;
			while (ch != ';' && i < 12) {
				cmdbuf[i++] = ch;
				mygetchar();
			}
			cmdbuf[i] = 0;
			if (i > 10) {
				/* bad &; field, should not occur, but I've seen them! */
				if (!intitle) {
					printf("&%s%c", cmdbuf, ch);
					startline = 0;
				}
				continue;
			}
			/* The table ends at the entry with an empty name.  (Testing
			   the array itself is always true and would run past the
			   end for an unknown name.)  A name that is not in the
			   table produces no output. */
			for (i = 0; a[i].in[0] != '\0'; i++) {
				if (strcmp(a[i].in,cmdbuf)==0) {
					if (!intitle) {
						putTableChar(i);
						startline = 0;
					}
					break;
				}
			}
			continue;
		}
		/* process <> command */
		mygetchar();
		notflag = 0;
		if (ch == '/') {
			notflag = 1;
			mygetchar();
		}
		/* Collect the tag name.  Characters that do not fit are read and
		   discarded so that an over-long name (a comment, for example)
		   cannot overrun cmdbuf; no recognized tag is that long.  Any
		   whitespace ends the name, which matters inside <pre> where
		   newlines are not converted to blanks. */
		i=0;
		while (ch != '>' && !isspace((unsigned char)ch)) {
			if (i < CMDBUF_SIZE - 1) cmdbuf[i++] = ch;
			mygetchar();
		}
		while (ch != '>') mygetchar();	/* skip any attributes */
		cmdbuf[i] = 0;
		/* <ctype.h> functions need an unsigned char value */
		while (i-- > 0)
			cmdbuf[i] = (char)tolower((unsigned char)cmdbuf[i]);

		if (strcmp("p",cmdbuf)==0) {
			/* </p> will end/start a new paragraph, while <p> will start a new
			paragraph if one not already started */
			if (notflag) newpara();
			else {
				cnewpara();
				skipws = 1;  /* Ignore white space after command <p> */
			}
			continue;
		}
		if (strcmp("br",cmdbuf)==0) {
			/* force a line break; skipws is not set here, so blanks
			   that follow the tag are output */
			newline();
			continue;
		}
		if (strcmp("hr", cmdbuf) == 0) {
			/* Draw a horizontal line */
			cnewline();
			printf("-------------------------------------------------------");
			newline();
			skipws= 1; /* ignore white space */
			continue;
		}
		if (strcmp("title", cmdbuf) == 0) {
			/* Disable output for length of title */
			intitle = !notflag;
			skipws= 1; /* ignore white space */
			continue;
		}
		if (strcmp("addrline", cmdbuf) == 0) {
			/* address line */
			if (notflag) newline();
			else cnewline();
			skipws=1;
			continue;
		}
		if (strcmp("head", cmdbuf) == 0 ||
			strcmp("titlepart", cmdbuf) == 0 ||
			(strlen(cmdbuf)==2 && cmdbuf[0] == 'h')) {
			/* a heading -- put it on its own line, and space after */
			if (notflag) {
				newline();
				newline();
			}
			else {
				cnewline();
			}
			skipws= 1; /* ignore white space */
			continue;
		}
		if (strcmp("pre", cmdbuf)==0) {
			/* preformatted */
			if (!notflag) cnewline();
			quoting = !notflag;
			continue;
		}
		if (strcmp("blockquote", cmdbuf) == 0||
			strcmp("bq",cmdbuf) == 0) {
			/* Indent -- well this doesn't work particularly well
			because we really want some later program to do the
			formatting */
			indent += (notflag ? -INDENT_STEP : INDENT_STEP);
			if (indent < 0) indent = 0;	/* unmatched closing tag */
			continue;
		}
		if (strcmp("l", cmdbuf) == 0) {
			/* I believe this stands for "literal" but can't find it in def */
			skipws=0;
			continue;
		}
		if (strcmp("q", cmdbuf) == 0) {
			/* quotation: table entries 0 and 1 are the open and close
			   double quotes */
			if (!intitle) {
				putTableChar(notflag ? 1 : 0);
				startline = 0;
			}
			continue;
		}
		if (strcmp("ol", cmdbuf) == 0) {
			/* ordered list -- start with a count of one */
			if (notflag) close_list(&listlevel);
			else open_list(&listlevel, listcount, 1);
			continue;
		}
		if (strcmp("ul", cmdbuf) == 0) {
			/* un-ordered list -- bullets */
			if (notflag) close_list(&listlevel);
			else open_list(&listlevel, listcount, 0);
			continue;
		}
		if (strcmp("menu", cmdbuf) == 0 ||
			strcmp("dir", cmdbuf) == 0 ||
			strcmp("list", cmdbuf) == 0 ||
			strcmp("dl", cmdbuf) == 0) {
			/* unmarked list -- no markings */
			if (notflag) close_list(&listlevel);
			else open_list(&listlevel, listcount, -1);
			continue;
		}
		if (strcmp("li", cmdbuf) == 0) {
			/* ignore outside of list, ignore /li */
			if (notflag || listlevel < 0) continue;
			newline();
			/* levels beyond MAX_LIST_DEPTH have no counter and are
			   written without a marker */
			if (listlevel < MAX_LIST_DEPTH) {
				if (listcount[listlevel] == 0) {
					fputs("* ", stdout); /* the best bullet we can do */
				}
				else if (listcount[listlevel] > 0) {
					printf("%d ", listcount[listlevel]++);
				}
			}
			skipws = 1;
			startline = 0;
			continue;
		}
		if (strcmp("item", cmdbuf) == 0) {
			if (notflag) newline();
			else cnewline();
			skipws = 1;
			startline = 0;
			continue;
		}
		if (strcmp("dt", cmdbuf) == 0) {
			/* ignore outside of list, ignore /dt */
			if (notflag || listlevel < 0) continue;
			newline();
			skipws = 1;
			startline = 0;
			continue;
		}
		if (strcmp("dd", cmdbuf) == 0) {
			/* ignore outside of list */
			if (listlevel < 0) continue;
			if (notflag) {
				indent -= INDENT_STEP;
				if (indent < 0) indent = 0;	/* unmatched closing tag */
			}
			else {
				indent += INDENT_STEP;
				newline();
				startline = 0;
				skipws = 1;
			}
			continue;
		}
		if (strcmp("tr", cmdbuf) == 0) {
			/* table row start */
			if (notflag) continue;
			newline();
			skipws = 1;
			continue;
		}
		if (strcmp("th", cmdbuf) == 0 || strcmp("td", cmdbuf) == 0) {
			/* Table entry start -- may need tab */
			if (notflag) continue;
			if (startline == 0)
				putchar('\t');	/* tab over */
			skipws = 1;
			continue;
		}
		if (strcmp("table", cmdbuf) == 0) {
			/* Table start and end */
			if (notflag) newpara();
			else {
				cnewline();
				skipws = 1;
			}
			continue;
		}
		/* any other tag is dropped without output */
	}
}