The CLEAN Filter

As a more practical example of MS-DOS filters, let's look at a simple but very useful filter called CLEAN. Figures 15-3 and 15-4 show the assembly-language and C source code for this filter. CLEAN processes a text stream by stripping the high bit from all characters, expanding tabs to spaces, and throwing away all control codes except carriage returns, linefeeds, and formfeeds. Consequently, CLEAN can transform almost any kind of word-processed document file into a plain ASCII text file.

name clean

page 55,132

title CLEAN--Text-file filter

;

; CLEAN.ASM Filter to turn document files into

; normal text files.

;

; Copyright 1988 Ray Duncan

;

; Build: C>MASM CLEAN;

; C>LINK CLEAN;

;

; Usage: C>CLEAN <infile >outfile

;

; All text characters are passed through with high

; bit stripped off. Formfeeds, carriage returns,

; and linefeeds are passed through. Tabs are expanded

; to spaces. All other control codes are discarded.

;

tab     equ     09h                     ; ASCII horizontal tab
lf      equ     0ah                     ; ASCII linefeed
ff      equ     0ch                     ; ASCII formfeed
cr      equ     0dh                     ; ASCII carriage return
blank   equ     20h                     ; ASCII space
eof     equ     1ah                     ; Ctrl-Z, end-of-file mark

tabsiz  equ     8                       ; columns between tab stops
bufsiz  equ     128                     ; bytes in each of the input
                                        ; and output buffers

stdin   equ     0                       ; handle: standard input
stdout  equ     1                       ; handle: standard output
stderr  equ     2                       ; handle: standard error

_TEXT segment word public 'CODE'

assume cs:_TEXT,ds:_DATA,es:_DATA,ss:STACK

clean   proc    far                     ; entry point from MS-DOS
;
; Main line of the filter.  Reads characters with getc, strips the
; high bit, and then:
;   - text characters (>= blank) are written and advance the column,
;   - CR, LF, and FF are written and reset the column to zero,
;   - a tab is replaced by blanks up to the next multiple of tabsiz,
;   - Ctrl-Z is written and ends processing,
;   - every other control code is discarded.
; Terminates with return code 0 at end of input, or writes msg2 to
; stderr and terminates with return code 1 when putc reports a
; failed write.
;
        push    ds                      ; save DS:0000 for final
        xor     ax,ax                   ; return to MS-DOS, in case
        push    ax                      ; function 4ch can't be used

        mov     ax,_DATA                ; make data segment addressable
        mov     ds,ax
        mov     es,ax

        mov     ah,30h                  ; check version of MS-DOS
        int     21h
        cmp     al,2                    ; MS-DOS 2.0 or later?
        jae     clean1                  ; jump if version OK

                                        ; MS-DOS 1, display error
                                        ; message and exit...
        mov     dx,offset msg1          ; DS:DX = message address
        mov     ah,9                    ; function 9 = display string
        int     21h                     ; transfer to MS-DOS
        ret                             ; far return through the
                                        ; DS:0000 address saved above

clean1: call    init                    ; initialize input buffer

clean2: call    getc                    ; get character from input
        jc      clean9                  ; exit if end of stream
        and     al,07fh                 ; strip off high bit
        cmp     al,blank                ; is it a control char?
        jae     clean4                  ; no, write it
        cmp     al,eof                  ; is it end of file?
        je      clean8                  ; yes, write EOF and exit
        cmp     al,tab                  ; is it a tab?
        je      clean6                  ; yes, expand it to spaces
        cmp     al,cr                   ; is it a carriage return?
        je      clean3                  ; yes, go process it
        cmp     al,lf                   ; is it a linefeed?
        je      clean3                  ; yes, go process it
        cmp     al,ff                   ; is it a formfeed?
        jne     clean2                  ; no, discard it

clean3: mov     column,0                ; if CR, LF, or FF,
        jmp     clean5                  ; reset column to zero

clean4: inc     column                  ; if non-control character,
                                        ; increment column counter

clean5: call    putc                    ; write char to stdout
        jnc     clean2                  ; if disk not full,
                                        ; get another character

clean_err:                              ; write failed (also reached
                                        ; from the tab loop below)...
        mov     dx,offset msg2          ; DS:DX = error message
        mov     cx,msg2_len             ; CX = message length
        mov     bx,stderr               ; BX = standard error handle
        mov     ah,40h                  ; function 40h = write
        int     21h                     ; transfer to MS-DOS
        mov     ax,4c01h                ; function 4ch = terminate
                                        ; return code = 1
        int     21h                     ; transfer to MS-DOS

clean6: mov     ax,column               ; tab code detected
        xor     dx,dx                   ; DX:AX = column, unsigned
                                        ; (column is a counter, so a
                                        ; signed divide would go wrong
                                        ; past 32767 columns)
        mov     cx,tabsiz               ; tabsiz - (column MOD tabsiz)
        div     cx                      ; is number of spaces needed
        sub     cx,dx                   ; to move to next tab stop
        add     column,cx               ; also update column counter

clean7: push    cx                      ; save spaces counter
        mov     al,blank                ; write an ASCII space
        call    putc
        pop     cx                      ; restore spaces counter
                                        ; (POP leaves the carry flag
                                        ; from putc intact)
        jc      clean_err               ; report a failed write here
                                        ; just as clean5 does
        loop    clean7                  ; loop until tab stop
        jmp     clean2                  ; get another character

                                        ; NOTE(review): the status of
                                        ; the EOF-mark write and of the
                                        ; final flush is not checked.
                                        ; The C version of this filter
                                        ; also skips its error check
                                        ; for the EOF mark, so this is
                                        ; presumably deliberate.
clean8: call    putc                    ; write EOF mark

clean9: call    flush                   ; write last output buffer
        mov     ax,4c00h                ; function 4ch = terminate
                                        ; return code = 0
        int     21h                     ; transfer to MS-DOS

clean   endp

getc    proc    near                    ; fetch next byte of stdin
;
; Returns:  carry clear, AL = next character from the input buffer
;           carry set if the read fails or returns zero bytes
; Uses:     AX, BX; also CX and DX when the buffer must be refilled
;
        mov     bx,iptr                 ; BX = offset of next unread byte
        cmp     bx,ilen                 ; anything left in ibuff?
        je      getc_fill               ; no, go read another block

getc_take:
        mov     al,ibuff[bx]            ; AL = next character
        inc     bx                      ; step past it
        mov     iptr,bx                 ; remember the new position
        clc                             ; carry = 0: AL is valid
        ret

getc_fill:                              ; buffer exhausted, refill it
        mov     bx,stdin                ; BX = standard input handle
        mov     cx,bufsiz               ; CX = bytes requested
        mov     dx,offset ibuff         ; DS:DX = where to put them
        mov     ah,3fh                  ; function 3fh = read
        int     21h                     ; transfer to MS-DOS
        jc      getc_eof                ; read error counts as end
        test    ax,ax                   ; zero bytes returned?
        jz      getc_eof                ; yes, input is exhausted
        mov     ilen,ax                 ; record how many bytes arrived
        xor     bx,bx                   ; restart at front of buffer
        jmp     getc_take

getc_eof:
        stc                             ; carry = 1: no more input
        ret

getc    endp

putc    proc    near                    ; queue one byte for stdout
;
; Entry:    AL = character to write
; Returns:  carry clear if the character was buffered (and, when the
;           buffer filled, written out completely)
;           carry set if the write failed or was short (disk full)
; Uses:     BX; also AX, CX, and DX when the buffer is written
;
        mov     bx,optr                 ; BX = next free slot in obuff
        mov     obuff[bx],al            ; deposit the character
        inc     bx                      ; count it
        cmp     bx,bufsiz               ; has the buffer filled up?
        je      putc_write              ; yes, send it to stdout

putc_ok:
        mov     optr,bx                 ; remember the fill level
        clc                             ; carry = 0: success
        ret

putc_write:                             ; buffer is full, write it
        mov     bx,stdout               ; BX = standard output handle
        mov     cx,bufsiz               ; CX = bytes to write
        mov     dx,offset obuff         ; DS:DX = data to write
        mov     ah,40h                  ; function 40h = write
        int     21h                     ; transfer to MS-DOS
        jc      putc_fail               ; MS-DOS reported an error
        cmp     ax,cx                   ; every byte accepted?
        jne     putc_fail               ; short write means disk full
        xor     bx,bx                   ; buffer is empty again
        jmp     putc_ok

putc_fail:
        stc                             ; carry = 1: write failed
        ret                             ; (optr is left unchanged)

putc    endp

init    proc    near                    ; preload the input buffer
;
; Reads up to bufsiz bytes from stdin into ibuff.  On success the
; byte count is stored in ilen; if the read fails, ilen is left
; untouched.  iptr is not modified.
; Uses:     AX, BX, CX, DX
;
        mov     dx,offset ibuff         ; DS:DX = destination buffer
        mov     cx,bufsiz               ; CX = bytes requested
        mov     bx,stdin                ; BX = standard input handle
        mov     ah,3fh                  ; function 3fh = read
        int     21h                     ; transfer to MS-DOS
        jc      init_done               ; read failed, store nothing
        mov     ilen,ax                 ; ilen = bytes actually read

init_done:
        ret

init    endp

flush   proc    near                    ; write out pending output
;
; Writes the optr bytes currently held in obuff to stdout.  Does
; nothing when the buffer is empty.  The result of the write is not
; examined here, and optr is not reset.
; Uses:     CX; also AX, BX, and DX when a write is issued
;
        mov     cx,optr                 ; CX = bytes waiting in obuff
        jcxz    flush_done              ; nothing pending, just return
        mov     ah,40h                  ; function 40h = write
        mov     bx,stdout               ; BX = standard output handle
        mov     dx,offset obuff         ; DS:DX = data to write
        int     21h                     ; transfer to MS-DOS

flush_done:
        ret

flush   endp

_TEXT ends

_DATA   segment word public 'DATA'

ibuff   db      bufsiz dup (0)          ; block most recently read
                                        ; from stdin
obuff   db      bufsiz dup (0)          ; characters waiting to be
                                        ; written to stdout

iptr    dw      0                       ; offset of next unread byte
                                        ; in ibuff
ilen    dw      0                       ; count of valid bytes in ibuff
optr    dw      0                       ; count of bytes held in obuff
column  dw      0                       ; output column, used to
                                        ; find the next tab stop

                                        ; '$'-terminated text for
                                        ; function 9 (display string)
msg1    db      cr,lf,'clean: need MS-DOS version 2 or greater.',cr,lf,'$'

                                        ; written with function 40h,
                                        ; so the length is explicit
msg2    db      cr,lf,'clean: disk is full.',cr,lf
msg2_len equ    $-msg2

_DATA   ends

STACK   segment para stack 'STACK'

        db      128 dup (?)             ; 64 words of stack space

STACK   ends

        end     clean                   ; program entry point = clean

Figure 15-3. CLEAN.ASM, the source code for the MASM version of the CLEAN filter.

/*

CLEAN.C Filter to turn document files into

normal text files.

Copyright 1988 Ray Duncan

Compile: C>CL CLEAN.C

Usage: C>CLEAN <infile >outfile

All text characters are passed through with high bit stripped

off. Formfeeds, carriage returns, and linefeeds are passed

through. Tabs are expanded to spaces. All other control codes

are discarded.

*/

#include <stdio.h>
#include <stdlib.h>

#define TAB_WIDTH   8           /* columns between tab stops  */

#define TAB         '\t'        /* ASCII horizontal tab (09h) */
#define LF          '\n'        /* ASCII linefeed (0Ah)       */
#define FF          '\f'        /* ASCII formfeed (0Ch)       */
#define CR          '\r'        /* ASCII carriage return (0Dh)*/
#define BLANK       ' '         /* ASCII space (20h)          */
#define EOFMK       '\x1A'      /* Ctrl-Z end-of-file mark    */

main(int argc, char *argv[])

{

char c; /* character from stdin */

int col = 0; /* column counter */

while((c = getchar()) != EOF) /* read input character */

{

c &= 0x07F; /* strip high bit */

switchÓ /* decode character */

{

case LF: /* if linefeed or */

case CR: /* carriage return, */

col=0; /* reset column count */

case FF: /* if formfeed, carriage */

wcharÓ; /* return, or linefeed, */

break; /* pass character through */

case TAB: /* if tab, expand to spaces*/

do wchar(BLANK);

while((++col % TAB_WIDTH) != 0);

break;

default: /* discard other control */

if(c >= BLANK) /* characters, pass text */

{ /* characters through */

wcharÓ;

col++; /* bump column counter */

}

break;

}

}

wchar(EOFMK); /* write end-of-file mark */

exit(0);

}

/*
    Write a character to the standard output.  If the write
    fails, display an error message on the standard error
    stream and terminate with return code 1.  A failure while
    writing the end-of-file mark (EOFMK) is ignored.

    c        character to write
    returns  0 (callers in this file do not use the value)
*/
int wchar(char c)
{
    if((putchar(c) == EOF) && (c != EOFMK))
    {
        fputs("clean: disk full",stderr);
        exit(1);
    }
    return 0;
}

Figure 15-4. CLEAN.C, the source code for the C version of the CLEAN filter.

When using the CLEAN filter, you must specify the source and destination files with redirection parameters in the command line; otherwise, CLEAN will simply read the keyboard and write to the display. For example, to filter the document file MYFILE.DOC and leave the result in the file MYFILE.TXT, you would enter the following command:

C>CLEAN <MYFILE.DOC >MYFILE.TXT <Enter>

(Note that the original file, MYFILE.DOC, is unchanged.)

One valuable application of this filter is to rescue assembly-language source files. If you accidentally edit such a source file in document mode, the resulting file may cause the assembler to generate spurious or confusing error messages. CLEAN lets you turn the source file back into something the assembler can cope with, without losing the time you spent to edit it.

Another handy application for CLEAN is to list a word-processed document in raw form on the printer, using a command such as

C>CLEAN <MYFILE.DOC >PRN <Enter>

Contrasting the C and assembly-language versions of this filter provides some interesting statistics. The C version contains 79 lines and compiles to a 5889-byte .EXE file, whereas the assembly-language version contains 265 lines and builds an 1107-byte .EXE file. The size and execution-speed advantages of implementing such tools in assembly language are obvious, even compared with such an excellent compiler as the Microsoft C Optimizing Compiler. However, you must balance performance considerations against the time and expense required for programming, particularly when a program will not be used very often.