/*
  This source is part of PCOak, an electronic mailer for DOS based on PCElm.

  PCElm is Copyright (c) 1988-1993 Martin Freiss and Wolfgang Siebeck
           Copyright (c) 1992-1999 Demon Internet
  PCOak is Copyright (c) 2000-2002 Simon Turner, Pete Disdale and dispc members

  Thanks to an agreement between the original PCElm authors and Demon Internet
  made in late 1999:

	This program is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License, version 1, as
	published by the Free Software Foundation.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	See the file COPYING, which contains a copy of the GNU General
	Public License.
*/

/*
 * parser.c -- functions for parsing address lists
 */

#include <stdio.h>
#include "pcoak.h"
#include "chars.h"

#include "ustring.h"	/* just for warn_overrun(), nothing else */


/*
 * Scanner states used while walking an address list.  Each kind of block
 * (plain word, address word, comment, quoted-string, route-addr) has a
 * START_ / IN_ / END_ triple; IN_NOTHING is the idle state between blocks.
 * The enumerator order (and hence the numeric values) is left untouched.
 */
enum addrstate {
    /* idle: not in anything (whitespace) */
    IN_NOTHING,

    /* plain words */
    START_WORD,         /* first character of a plain word */
    IN_WORD,            /* inside a plain word */
    END_WORD,           /* just past the end of a word (plain or address) */

    /* address words, i.e. words containing "@" */
    START_ADDR,         /* first character of an address word */
    IN_ADDR,            /* inside an address word */

    /* comments: (...) */
    START_COMMENT,      /* opening '(' of a comment */
    IN_COMMENT,         /* inside a comment */
    END_COMMENT,        /* closing ')' of a comment */

    /* quoted-strings: "..." */
    START_QUOTE,        /* opening '"' of a quoted-string */
    IN_QUOTE,           /* inside a quoted-string */
    END_QUOTE,          /* closing '"' of a quoted-string */

    /* route-addrs: <...> */
    START_ROUTE,        /* opening '<' of a route-addr */
    IN_ROUTE,           /* inside a route-addr */
    END_ROUTE           /* closing '>' of a route-addr */
};
    
/*
 * struct block records the extent of one lexical unit found while scanning:
 * a word (delimited by whitespace and/or commas), a comment (...), a
 * quoted-string "...", or a route-addr <...>.
 *
 * Both pointers are inclusive: <start> addresses the first character of the
 * unit and <end> addresses its last character (not one past it).
 */
struct block {
    const char *start;          /* first character of the block */
    const char *end;            /* last character of the block */
};


/*
 * next_address()
 *
 * <from> is the current position in a mail address list (To, Cc, Bcc).
 * <into> is a different buffer, of size <intosize>, into which the next
 * complete address from the list should be copied (this may be NULL, in which
 * case we don't copy anything).  If specified, <addr_ptr> and <addr_len> are
 * to be filled in with the start and length of the actual SMTP addr-spec
 * address within the <into> buffer -- unless <into> is NULL, in which case
 * they refer to the <from> buffer instead.  (This <addr_ptr> is the actual
 * RFC 822 addr-spec -- i.e. just "local-part@domain".)  We return a pointer
 * to the start of the next address in <from> (which may just be the
 * terminator, if this is the last address), or NULL if the list is empty.
 *
 * This function combines the actions of finding the next single address, and
 * parsing enough of it to determine the actual addr-spec SMTP address.
 *
 * Distinct addresses should normally be separated by commas, but if the list
 * of addresses has been edited by the user, there may be addresses which are
 * only separated by whitespace (e.g. "john pete simon").  We expect valid
 * addresses to take one of the following forms:
 *
 *   (a)  simon@twoplaces.co.uk (Simon Turner)
 *   (a1) simon@twoplaces.co.uk				[subset of (a)]
 *   (b)  Simon Turner <simon@twoplaces.co.uk>
 *   (b1) "Simon Turner" <simon@twoplaces.co.uk>	[alternative to (b)]
 *   (b2) <simon@twoplaces.co.uk>			[subset of (b)]
 *   (c)  simon
 *
 * Using RFC 822 terminology, (a) and (a1) are RFC 822 "addr-spec", with and
 * without a comment (the comment can legally go anywhere, but conventionally
 * follows the address); (b) and (b1) are "phrase route-addr"; (b2) is just a
 * "route-addr" (strictly illegal in RFC 822, permitted by RFC 1123).  The
 * single name in (c) is not a valid RFC 822 address (it is neither "mailbox"
 * nor "group") but we treat it as though it were valid; it is either a local
 * user, or an alias waiting to be expanded.
 *
 * Obviously we regard a comma "," which is neither quoted, nor in a comment,
 * as the end of an address.  However, there may be addresses in one of the
 * above forms which are separated only by whitespace; how do we know where to
 * split them?
 *
 * A single word, containing no whitespace, may be either part of the "phrase"
 * for a (b) format address, or an address in its own right.  If it contains
 * an "@", and is not quoted, then it must be an address, so we stop at the
 * end of the word (unless it is followed by a comment; see below).  If it
 * does not contain an "@", we note its presence and keep scanning; if we come
 * across another word which *does* contain an "@", then that later word must
 * be an address, which means that all earlier words must be addresses too; we
 * stop at the end of the first word.
 *
 * If we only find more "@"-less words, they may all be part of a format (b)
 * "phrase"; if we find a "route-addr" <...>, all words that have been passed
 * so far are assumed to be part of its leading "phrase".  If, however, we
 * find a comment (...), the last word before the comment is an (a) format
 * address; all words before that are addresses in their own right, so we stop
 * after the first such word.
 *
 * If we find a quoted-string "..."  followed by a route-addr <...>, this
 * "quoted-string route-addr" is a complete (b1) format address, so we assume
 * that all distinct words before the start of the quoted-string are
 * addresses in their own right.  A quoted-string which is *not* followed by
 * a route-addr is assumed to be an address in its own right; therefore any
 * words before it must be addresses as well.
 *
 * This means that finding a quoted-string means that all words found before
 * now are addresses; the quoted-string itself may be an address in its own
 * right, or the phrase part of a (b) address, but we won't know until we've
 * seen whether there's a route-addr following.  If we find the start of
 * anything except a route-addr and we've previously seen a complete
 * quoted-string, then the quoted-string is an address in its own right.
 *
 * If we find a word containing an "@", it is an address; but it may have a
 * following comment (...) as in (a) above.  We keep scanning after the end of
 * such a word; if the next non-whitespace thing we find is *not* a '('
 * indicating the start of a comment, we stop at the end of the word.  If we
 * do find a comment, we scan until we reach the end of the comment, and then
 * (provided we saw a word -- with or without "@" -- before the comment) stop
 * after the ')'; if we *didn't* see a word before the comment, we regard the
 * comment as a pre-comment for the *next* word instead, and regard the next
 * word as an address (unless it's a quoted-string or a route-addr, in which
 * case we just ignore the comment).
 *
 * What is being described here is emphatically *NOT* a proper RFC 822 parser;
 * we're using whitespace as a delimiter, and RFC 822 expressly permits
 * LWSP-chars all over the place (including in the middle of an addr-spec); we
 * make no attempt to handle quotes in the full RFC 822 way, but only in the
 * commonly-found ways; also, of the above examples, (c) is invalid under RFC
 * 822.  Given that the user wants/needs to be able to enter addresses (either
 * directly, or from an alias) like
 *	john pete "Simon" <simon@isp> dave@home (Dave)
 * which is completely non-compliant with RFC 822, the last thing we want here
 * is a rigidly-enforced full RFC 822 parser (or so it seems to me).
 *
 * A few examples to illustrate how address lists may look silly, but should
 * be resolvable into something that can be justified under the rules above:
 *
 * simon (Simon) john pete
 * -----------------------
 * 	(a) simon (Simon)		[ addr-spec = "simon" ]
 * 	(b) john			[ addr-spec = "john" ]
 * 	(c) pete			[ addr-spec = "pete" ]
 *
 * simon (Simon) john pete <pete@isp>
 * ----------------------------------
 *	(a) simon (Simon)		[ addr-spec = "simon" ]
 *	(b) john pete <pete@isp>	[ addr-spec = "pete@isp" ]
 *
 * simon (Simon) john "pete" <pete@isp>
 * ------------------------------------
 *	(a) simon (Simon)		[ addr-spec = "simon" ]
 *	(b) john			[ addr-spec = "john" ]
 *	(c) "pete" <pete@isp>		[ addr-spec = "pete@isp" ]
 *
 * john pete "Simon" <simon@isp> dave@home (Dave)
 * ----------------------------------------------
 *	(a) john			[ addr-spec = "john" ]
 *	(b) pete			[ addr-spec = "pete" ]
 *	(c) "Simon" <simon@isp>		[ addr-spec = "simon@isp" ]
 *	(d) dave@home (Dave)		[ addr-spec = "dave@home" ]
 *
 * "pete" john <john21> dave simon (Simon) (Sarah) sarah cliff@home
 * ----------------------------------------------------------------
 *	(a) "pete"			[ addr-spec = "pete" ]
 *	(b) john <john21>		[ addr-spec = "john21" ]
 *	(c) dave			[ addr-spec = "dave" ]
 *	(d) simon (Simon)		[ addr-spec = "simon" ]
 *	(e) (Sarah) sarah		[ addr-spec = "sarah" ]
 *	(f) cliff@home			[ addr-spec = "cliff@home" ]
 *
 * simon john pete <dave>
 * ----------------------
 *	(a) simon john pete <dave>	[ addr-spec = "dave" ]
 *
 * simon john pete@isp <dave>
 * --------------------------
 *	(a) simon			[ addr-spec = "simon" ]
 *	(b) john			[ addr-spec = "john" ]
 *	(c) pete@isp			[ addr-spec = "pete@isp" ]
 *	(d) <dave>			[ addr-spec = "dave" ]
 *
 * Note that, given our desire to allow whitespace to be used as an address
 * separator, we're not going to support whitespace within an addr-spec (even
 * though RFC 822 permits this), nor random comments scattered through the
 * text.  That would be *too* silly.
 *
 * The way we achieve this is by keeping tally of state changes (start/in/end
 * of various states: comment, quote, route, word, address) and by examining
 * what's going on when the state changes.  The following are the rules to
 * obey:
 *
 * 1. Start of 2nd or later word, and we've seen an "@" somewhere:
 *	Take the first word on its own.
 *
 * 2. In the 2nd or later word, and we've just seen an "@":
 *	Take the first word on its own.
 *
 * 3. Start of 2nd or later word, and we've seen a comment before:
 *	Take the comment and the first word.
 *
 * 4. Start of quoted-string, and we've seen >= 1 word before:
 *	Take the first word on its own.
 *
 * 5. Start of a route-addr, and we've previously seen an "@":
 *	Take the first word on its own.
 *
 * 6. Start of something other than a route-addr, and we've seen a complete
 *    quoted-string before:
 *	Take the quoted-string on its own.
 *
 * 7. End of a comment, and we've seen > 1 word before:
 *	Take the first word on its own.
 *
 * 8. End of a comment, and we've seen exactly 1 word before:
 *	Take the first word and this comment.
 *
 * 9. End of a route-addr:
 *	Take everything up to the end of this route-addr.
 *
 * 10. If we've reached the end with > 1 word (forced complete):
 *	Take the first word on its own.
 *
 * 11. If we've reached the end with 1 word (forced complete):
 *	Take everything, right up to the end.
 *
 * 12. If we've reached the end with an incomplete quoted-string:
 *	Force the end of the quoted-string, then as 6.
 *
 * 13. If we've reached the end with an incomplete comment:
 *	Force the end of the comment, then as 7 / 8.
 *
 * 14. If we've reached the end with an incomplete route-addr:
 *	Force the end of the route-addr, then as 9.
 */
char *next_address(char *into, size_t intosize, const char *from,
                   char **addr_ptr, size_t *addr_len)
{
    struct block word0, comment, quote, route;
    enum addrstate state;
    BOOL seen_addr = FALSE;
    BOOL state_changed;
    unsigned nwords;
    int bracket_count = 0;
    const char *p, *stop, *astart, *aend;
    char *ip;
    size_t n;
    size_t copied = 0;          /* characters actually stored in <into> */
    size_t offset = 0;          /* offset of the addr-spec from <from> */
    size_t len = 0;             /* length of the addr-spec */

    /*
     * Set all block start/end pointers to NULL; the .start pointer of a
     * block is always set before its .end, so a non-NULL .end means that we
     * have seen a block of that type.  Every test below checks for NULL
     * before doing pointer arithmetic or comparisons on these pointers.
     */
    word0.start = word0.end = NULL;
    comment.start = comment.end = NULL;
    quote.start = quote.end = NULL;
    route.start = route.end = NULL;

    stop = astart = aend = NULL;

    /*
     * Skip leading whitespace and/or commas, which gives us our starting
     * point; then run through the line, stopping when we're certain we've
     * found a complete address.
     */
    from = skip_sep(from);              /* skip leading LWSP & commas */
    if (!*from)                         /* End of the line: nothing to do! */
        return NULL;

    nwords = 0;                         /* No completed words seen yet */
    state = IN_NOTHING;                 /* No state yet */

    for (p = from; *p; p++)
    {
        /*
         * Settle any transitional state left by the previous character:
         * START_xxx becomes IN_xxx, END_xxx becomes IN_NOTHING.
         */
        switch (state)
        {
        case START_QUOTE:   state = IN_QUOTE;   break;
        case START_COMMENT: state = IN_COMMENT; break;
        case START_ROUTE:   state = IN_ROUTE;   break;
        case START_WORD:    state = IN_WORD;    break;
        case START_ADDR:    state = IN_ADDR;    break;
        case END_QUOTE:
        case END_COMMENT:
        case END_ROUTE:
        case END_WORD:      state = IN_NOTHING; break;
        default:                                break;
        }
        state_changed = FALSE;          /* no changes yet */

        if (state == IN_QUOTE)          /* currently in a quoted-string */
        {
            if (*p != '\"')                 /* still in the quoted-string */
                continue;
            quote.end = p;
            state = END_QUOTE;              /* ending IN_QUOTE state */
            state_changed = TRUE;
        }
        else if (state == IN_COMMENT)   /* currently in a comment */
        {
            if (*p == '(')                  /* start of a nested comment */
            {
                bracket_count++;
                continue;
            }
            if (*p != ')')                  /* still in the comment */
                continue;
            if (--bracket_count > 0)        /* still in *a* nested comment */
                continue;
            comment.end = p;
            state = END_COMMENT;            /* ending IN_COMMENT state */
            state_changed = TRUE;
        }
        else if (state == IN_ROUTE)     /* currently in a route-addr */
        {
            if (*p != '>')                  /* still in the route-addr */
                continue;
            route.end = p;
            state = END_ROUTE;              /* ending IN_ROUTE state */
            state_changed = TRUE;
        }
        else if (*p == '\"')            /* start of a quoted-string */
        {
            /*
             * Rule 6 for a second quoted-string: this must be tested
             * *before* quote.start is overwritten, otherwise the earlier
             * (complete, non-empty) quoted-string can no longer be
             * recognised and is silently merged into what follows.
             */
            if (quote.end != NULL && quote.end > quote.start + 1)
            {
                astart = quote.start + 1;
                aend = quote.end - 1;
                stop = quote.end;
                break;
            }
            quote.start = p;
            state = START_QUOTE;            /* starting IN_QUOTE state */
            state_changed = TRUE;
        }
        else if (*p == '(')             /* start of a top-level comment */
        {
            bracket_count++;
            comment.start = p;
            state = START_COMMENT;          /* starting IN_COMMENT state */
            state_changed = TRUE;
        }
        else if (*p == '<')             /* start of a route-addr */
        {
            route.start = p;
            state = START_ROUTE;            /* starting IN_ROUTE state */
            state_changed = TRUE;
        }
        else if (*p == ',')             /* end of this block! */
        {
            break;
        }
        else if (islwsp(*p))            /* LWSP-char */
        {
            if (state != IN_NOTHING)        /* just gone over end of word */
            {
                if (nwords++ == 0)
                    word0.end = p - 1;      /* word ended at prev. char */
                state = END_WORD;
                state_changed = TRUE;
            }
        }
        else if (*p == '@')             /* this word is an address for sure */
        {
            if (state == IN_NOTHING)        /* start of the word */
            {
                if (nwords == 0)
                    word0.start = p;
                state = START_ADDR;
                state_changed = TRUE;
            }
            else if (state != IN_ADDR)
            {
                state = IN_ADDR;
                state_changed = TRUE;
            }
            seen_addr = TRUE;
        }
        else                            /* must be a basic word character */
        {
            if (state == IN_NOTHING)        /* start of the word */
            {
                if (nwords == 0)
                    word0.start = p;
                state = START_WORD;
                state_changed = TRUE;
            }
        }

        /*
         * Can we decide on anything yet?  If the state hasn't just changed,
         * there's nothing new; keep scanning.  Otherwise, obey rules 1-9.
         */
        if (!state_changed)             /* No new information: carry on */
            continue;

        /*
         * <nwords> counts *completed* words, so it is already 1 when the
         * 2nd word starts; "2nd or later word" therefore means nwords >= 1.
         */
        if ((state == START_WORD || state == START_ADDR || state == IN_ADDR)
            && nwords >= 1)
        {
            if (seen_addr ||                            /* Rule 1 / 2 */
                comment.end != NULL)                    /* Rule 3 */
            {
                astart = word0.start;
                aend = stop = word0.end;
                break;
            }
        }
        if ((state == START_QUOTE && nwords >= 1) ||    /* Rule 4 */
            (state == START_ROUTE && seen_addr))        /* Rule 5 */
        {
            astart = word0.start;
            aend = stop = word0.end;
            break;
        }
        /* Rule 6 (the START_QUOTE variant is handled where '"' is seen) */
        if ((state == START_WORD || state == START_ADDR ||
             state == START_COMMENT) &&
            quote.end != NULL && quote.end > quote.start + 1)
        {
            astart = quote.start + 1;
            aend = quote.end - 1;
            stop = quote.end;
            break;
        }
        if (state == END_COMMENT && nwords >= 1)
        {
            astart = word0.start;
            aend = word0.end;
            if (nwords > 1)                             /* Rule 7 */
                stop = word0.end;
            else                                        /* Rule 8 */
                stop = comment.end;
            break;
        }
        if (state == END_ROUTE)                         /* Rule 9 */
        {
            astart = route.start + 1;
            aend = route.end - 1;
            stop = route.end;
            break;
        }
    }

    /*
     * There are three ways of getting here: (a) we reached the end of the
     * line, (b) we found a comma, or (c) we've found an address.  For (a)
     * and (b) <stop> is still NULL and we must force completion.
     */
    if (stop == NULL)
    {
        /*
         * When the very last character closed a word, quoted-string or
         * comment, the END_xxx state was never settled by the top of the
         * loop; do it now so that the tests below see IN_NOTHING.
         * (END_ROUTE cannot reach here: rule 9 always sets <stop>.)
         */
        if (state == END_WORD || state == END_QUOTE || state == END_COMMENT)
            state = IN_NOTHING;

        if (state == START_WORD || state == IN_WORD ||  /* was in a word */
            state == START_ADDR || state == IN_ADDR)
        {
            if (nwords++ == 0)
                word0.end = p - 1;      /* word ended at prev. char */
            astart = word0.start;
            aend = word0.end;
            if (nwords > 1)                             /* Rule 10 */
                stop = word0.end;
            else                                        /* Rule 11 */
                stop = p - 1;
        }
        else if (state == IN_NOTHING && nwords > 0)  /* lwsp after 1+ words */
        {
            astart = word0.start;
            aend = word0.end;
            if (nwords > 1)                             /* Rule 10 */
                stop = word0.end;
            else                                        /* Rule 11 */
                stop = p - 1;
        }
        else if (state == START_QUOTE || state == IN_QUOTE) /* Rule 12 */
        {
            quote.end = p;
            astart = quote.start + 1;
            aend = quote.end - 1;
            stop = p - 1;
        }
        else if (state == START_COMMENT || state == IN_COMMENT) /* Rule 13 */
        {
            comment.end = p;
            if (nwords > 1)                             /* Rule 13.7 */
            {
                astart = word0.start;
                aend = stop = word0.end;
            }
            else if (nwords == 1)                       /* Rule 13.8 */
            {
                astart = word0.start;
                aend = word0.end;
                stop = p - 1;
            }
        }
        else if (state == START_ROUTE || state == IN_ROUTE) /* Rule 14 */
        {
            route.end = p;
            astart = route.start + 1;
            aend = route.end - 1;
            stop = p - 1;
        }
        else if (quote.end != NULL && quote.end > quote.start + 1)
        {
            /*
             * Rule 6, forced: a complete quoted-string followed by nothing
             * but whitespace, a comma or the end of the line is an address
             * in its own right.
             */
            astart = quote.start + 1;
            aend = quote.end - 1;
            stop = quote.end;
        }

        if (stop == NULL)               /* Nothing worked: no address! */
        {
            astart = NULL;
            aend = NULL;
            stop = p - 1;
        }
    }

    if (into != NULL && intosize > 0)
    {
        /*
         * Copy <from>..<stop> (inclusive) into <into>, keeping one byte
         * back for the terminating null which we *always* store.
         */
        size_t room = intosize - 1;

        for (p = from, ip = into, n = 0; p <= stop && n < room;
             p++, ip++, n++)
            *ip = *p;
        *ip = '\0';
        copied = n;
        if (p <= stop)          /* buffer would have overrun! */
        {
#ifdef REPORT_OVERRUNS
            warn_overrun(into, intosize, (size_t) (stop - from) + 1,
                         __FILE__, __LINE__, "into");
#endif
            /* ST-FIXME : try to be "clever" and truncate before bogus bits? */
        }
    }

    /*
     * Work out the start and length of the actual addr-spec.  If <into> is
     * non-NULL the result refers to <into>, otherwise to <from>.  When the
     * copy into <into> was truncated, clamp the result to the characters
     * that were really copied so that the caller is never handed a
     * pointer/length pair reaching beyond the copied data.
     */
    if (astart != NULL)
    {
        offset = (size_t) (astart - from);
        if (aend >= astart)
            len = (size_t) (aend - astart) + 1;

        if (into != NULL)
        {
            if (offset > copied)        /* addr-spec entirely truncated */
            {
                astart = NULL;
                len = 0;
            }
            else if (len > copied - offset)
                len = copied - offset;  /* addr-spec partly truncated */
        }
    }

    if (addr_ptr)
        *addr_ptr = (astart == NULL) ? NULL :
            (((into != NULL) ? into : ((char *) from)) + offset);
    if (addr_len)
        *addr_len = len;

    return skip_sep(stop + 1);
}
