// TDCPARSE.H

//------------------------------------------------------------------------ 
//
// Tabular Data Control Parse Module
// Copyright (C) Microsoft Corporation, 1996, 1997
//
// File: TDCParse.h
//
// Contents: Declaration of the TDC parser classes.
//
// The intent of these classes once was to create a pipeline.
//
//
//                      |
//                      |   Wide-character stream
//                      |   ~~~~~~~~~~~~~~~~~~~~~
//                     \|/
//           ------------------------
//           | CTDCTokenise object  |   Created with field & row delimiters, quote &
//           |   AddWcharBuffer()   |   escape characters
//           ------------------------
//                      |
//                      |   Stream of <field>, <eoln> and <eof> tokens
//                      |   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//                     \|/
//           ------------------------
//           | CTDCFieldSink object |   Abstract class, e.g. STD object created with
//           |   AddField()         |   sort/filter criteria & fUseHeader flag
//           |   EOLN()             |   to interpret the sequence of fields.
//           |   EOF()              |
//           ------------------------
//
//------------------------------------------------------------------------

// Default field delimiter, row delimiter and quote character, each written
// as a wide string literal (not a single WCHAR).
// NOTE(review): CTDCUnify::InitTokenizer() takes single WCHAR arguments, so
// callers presumably take the first character of these strings -- confirm.
#define DEFAULT_FIELD_DELIM L","
#define DEFAULT_ROW_DELIM L"\n"
#define DEFAULT_QUOTE_CHAR L"\""

// Numeric code page identifiers for the two Unicode byte orders.
#define UNICODE_CP 1200 // Win32's Unicode codepage
#define UNICODE_REVERSE_CP 1201 // Byte-swapped Unicode codepage

// NOTE(review): presumably the marker text that
// CTDCUnify::CheckForAllowDomainList() looks for in the incoming data --
// confirm against the implementation; its use is not visible in this header.
#define ALLOW_DOMAIN_STRING L"@!allow_domains"

//------------------------------------------------------------------------
//
// Class: CTDCFieldSink
//
// This class accumulates a sequence of <fields> and <eoln> tokens
// into a 2-D array.
//
// An admissible calling sequence on this object is:
// * 0 or more calls to AddField() or EOLN()
// * 1 call to EOF()
//
//------------------------------------------------------------------------

// Abstract receiver for the token stream. All three members are declared
// with the COM STDMETHOD(...) PURE macros, i.e. as pure virtual methods
// returning HRESULT, so a concrete sink must implement every one of them.
//
// NOTE(review): no virtual destructor is declared; destroying a concrete
// sink through a CTDCFieldSink pointer would be undefined behaviour --
// confirm sinks are never deleted via this interface.
class CTDCFieldSink
{
public:
// Delivers one <field> token, described by a wide-character pointer and a
// size.
// NOTE(review): dwSize is presumably a count of WCHARs (not bytes) and the
// text is presumably not guaranteed to be NUL-terminated -- confirm.
STDMETHOD(AddField)(LPWCH pwch, DWORD dwSize) PURE;
// Signals an <eoln> token, i.e. the end of the current row of fields.
STDMETHOD(EOLN)() PURE;
// Signals the <eof> token; per the calling sequence documented above this
// class it is called exactly once, after all AddField()/EOLN() calls.
// NOTE(review): EOF is also a macro defined by <stdio.h>; if that header is
// included before this one the method name would be macro-expanded --
// confirm the include order used by the build.
STDMETHOD(EOF)() PURE;
};

//------------------------------------------------------------------------
//
// Class: CTDCUnify
//
// This class takes a series of byte buffers and breaks them up into
// UNICODE buffers.
// The resulting buffers are passed to a CTDCTokenise object.
//
// An admissible calling sequence on this object is:
// * Exactly 1 call to Create()
// * 0 or more calls to AddByteBuffer() with a non-zero-sized buffer
// * Exactly 1 call to AddByteBuffer() with a zero-sized buffer
//
// Calls to query the characteristics of the parsed data are allowed
// after the call to Create(), but are only meaningful after a
// reasonable amount of data has been collected.
//
//
// Caveats:
// ~~~~~~~
// The class characterises the input stream as ASCII/UNICODE/COMPOSITE
// based on the buffer passed in the initial call to AddByteBuffer().
// If this buffer is too small, the class may make an incorrect
// characterisation.
//
//------------------------------------------------------------------------

// Converts incoming byte buffers to wide characters. As declared here the
// class also carries the tokeniser configuration and state (delimiters,
// quote/escape characters, field sink) itself, rather than handing the
// converted buffers to a separate tokeniser class.
//
// NOTE(review): the class declares a destructor and holds raw pointers
// (m_psByteBuf, m_psWcharBuf, m_pML) but declares no copy constructor or
// copy assignment. If the destructor releases those resources, copying an
// instance would be unsafe -- confirm instances are never copied.
class CTDCUnify
{
public:
CTDCUnify();
~CTDCUnify();
// Sets up the object for a given code page and IMultiLanguage instance.
// NOTE(review): presumably stored in m_nCodePage / m_pML -- confirm, and
// confirm whether pML is AddRef'd here and released by the destructor.
HRESULT Create(UINT nCodePage, IMultiLanguage *pML);
// Accepts dwSize bytes of raw input starting at pBytes.
// NOTE(review): the comment preceding this class describes the calling
// sequence in terms of AddByteBuffer(), which is not declared here; this
// method looks like its current equivalent -- confirm and update that
// comment.
HRESULT ConvertByteBuffer(BYTE *pBytes, DWORD dwSize);
// Supplies the sink that receives tokens, plus the field delimiter, row
// delimiter, quote and escape characters used when tokenising.
HRESULT InitTokenizer(CTDCFieldSink *pFieldSink,
WCHAR wchDelimField,
WCHAR wchDelimRow,
WCHAR wchQuote,
WCHAR wchEscape);
// NOTE(review): presumably tokenises the wide characters currently held in
// m_psWcharBuf, with fAtEnd set on the final call so that trailing data is
// flushed to the sink -- confirm against the implementation.
HRESULT AddWcharBuffer(BOOL fAtEnd);
// Inspects a byte buffer for Unicode content.
// NOTE(review): returns int rather than a boolean type; the meaning of the
// individual return values is not visible in this header.
int IsUnicode(BYTE * pBytes, DWORD dwSize);
// Three-state answer returned by CheckForAllowDomainList(): yes, no, or
// not yet determinable.
enum ALLOWDOMAINLIST
{
ALLOW_DOMAINLIST_YES,
ALLOW_DOMAINLIST_NO,
ALLOW_DOMAINLIST_DONTKNOW
};

// NOTE(review): presumably reports whether the data seen so far begins with
// an allow-domain list (see ALLOW_DOMAIN_STRING) -- confirm.
ALLOWDOMAINLIST CheckForAllowDomainList();
// NOTE(review): presumably tests pwzURL against the parsed allow-domain
// list; which HRESULT values mean match / no match is not visible here.
HRESULT MatchAllowDomainList(LPCWSTR pwzURL);

private:
// Tokeniser configuration; the names mirror the InitTokenizer() parameters.
CTDCFieldSink *m_pFieldSink;
WCHAR m_wchDelimField;
WCHAR m_wchDelimRow;
WCHAR m_wchQuote;
WCHAR m_wchEscape;
// NOTE(review): typed WCHAR although the "uc" prefix is used for ULONG
// counts elsewhere in this class -- confirm whether this is meant to be a
// character or a count of parsed characters.
WCHAR m_ucParsed;

// Tokeniser state and option flags.
// NOTE(review): exact semantics (which are per-character state and which
// are fixed options) are defined by the implementation, not shown here.
boolean m_fEscapeActive;
boolean m_fQuoteActive;
boolean m_fIgnoreNextLF;
boolean m_fIgnoreNextCR;
boolean m_fIgnoreNextWhiteSpace;
boolean m_fFoldCRLF;
boolean m_fFoldWhiteSpace;

// Code page / encoding-detection state.
// NOTE(review): m_nUnicode presumably caches the IsUnicode() result and
// m_dwConvertMode presumably is the conversion mode/state passed to the
// IMultiLanguage conversion calls -- confirm.
UINT m_nCodePage;
UINT m_nUnicode;
boolean m_fDataMarkedUnicode;
boolean m_fDataIsUnicode;
boolean m_fCanConvertToUnicode;
DWORD m_dwBytesProcessed;
DWORD m_dwConvertMode;

// Raw byte buffer with its size and count.
// NOTE(review): presumably Size is the allocated capacity and Count the
// number of bytes currently held -- confirm.
BYTE *m_psByteBuf;
ULONG m_ucByteBufSize;
ULONG m_ucByteBufCount;

// Wide-character buffer with its size and count (same caveat as above;
// presumably measured in WCHARs -- confirm).
WCHAR *m_psWcharBuf;
ULONG m_ucWcharBufSize;
ULONG m_ucWcharBufCount;

// IMultiLanguage interface pointer (see Create()).
IMultiLanguage *m_pML;
};