As a more practical example of MS-DOS filters, let's look at a simple but very useful filter called CLEAN. Figures 15-3 and 15-4 show the assembly-language and C source code for this filter. CLEAN processes a text stream by stripping the high bit from all characters, expanding tabs to spaces, and throwing away all control codes except carriage returns, linefeeds, and formfeeds. Consequently, CLEAN can transform almost any kind of word-processed document file into a plain ASCII text file.
name clean
page 55,132
title CLEAN--Text-file filter
;
; CLEAN.ASM Filter to turn document files into
; normal text files.
;
; Copyright 1988 Ray Duncan
;
; Build: C>MASM CLEAN;
; C>LINK CLEAN;
;
; Usage: C>CLEAN <infile >outfile
;
; All text characters are passed through with high
; bit stripped off. Formfeeds, carriage returns,
; and linefeeds are passed through. Tabs are expanded
; to spaces. All other control codes are discarded.
;
; ---- ASCII character codes ------------------------------------------
tab     equ     9               ; horizontal tab            (09h)
lf      equ     10              ; linefeed                  (0Ah)
ff      equ     12              ; formfeed                  (0Ch)
cr      equ     13              ; carriage return           (0Dh)
blank   equ     32              ; space                     (20h)
eof     equ     26              ; Ctrl-Z end-of-file mark   (1Ah)

; ---- program parameters ---------------------------------------------
tabsiz  equ     8               ; columns between tab stops
bufsiz  equ     128             ; bytes in each of the input
                                ; and output buffers

; ---- predefined MS-DOS handles --------------------------------------
stdin   equ     0               ; standard input
stdout  equ     1               ; standard output
stderr  equ     2               ; standard error
; Code segment.  ASSUME records, for the assembler, which segment each
; segment register is taken to address while this code is assembled;
; the clean procedure loads DS and ES with _DATA itself at entry.
_TEXT segment word public 'CODE'
assume cs:_TEXT,ds:_DATA,es:_DATA,ss:STACK
clean   proc    far             ; entry point from MS-DOS
;
; Main line of the filter.  Reads standard input one character at a
; time through getc, strips the high bit, expands tabs to blanks,
; discards every control code except CR, LF, FF and Ctrl-Z, and sends
; the result to standard output through putc.
;
; Exit:  Int 21H Function 4CH with return code 0 when the input is
;        exhausted (or a Ctrl-Z is read), or return code 1 after
;        writing msg2 to standard error if a write to standard output
;        fails.  Under MS-DOS 1.x, msg1 is displayed and the program
;        leaves through a far return to the DS:0000 address pushed
;        at entry.
;
        push    ds              ; save DS:0000 for final
        xor     ax,ax           ; return to MS-DOS, in case
        push    ax              ; function 4ch can't be used

        mov     ax,_DATA        ; make data segment addressable
        mov     ds,ax
        mov     es,ax

        mov     ah,30h          ; check version of MS-DOS
        int     21h
        cmp     al,2            ; MS-DOS 2.0 or later?
        jae     clean1          ; jump if version OK

                                ; MS-DOS 1, display error
                                ; message and exit...
        mov     dx,offset msg1  ; DS:DX = message address
        mov     ah,9            ; function 9 = display string
        int     21h             ; transfer to MS-DOS
        ret                     ; then exit the old way

clean1: call    init            ; initialize input buffer

clean2: call    getc            ; get character from input
        jc      clean9          ; exit if end of stream

        and     al,07fh         ; strip off high bit

        cmp     al,blank        ; is it a control char?
        jae     clean4          ; no, write it

        cmp     al,eof          ; is it end of file?
        je      clean8          ; yes, write EOF and exit

        cmp     al,tab          ; is it a tab?
        je      clean6          ; yes, expand it to spaces

        cmp     al,cr           ; is it a carriage return?
        je      clean3          ; yes, go process it

        cmp     al,lf           ; is it a linefeed?
        je      clean3          ; yes, go process it

        cmp     al,ff           ; is it a formfeed?
        jne     clean2          ; no, discard it

clean3: mov     column,0        ; if CR, LF, or FF,
        jmp     clean5          ; reset column to zero

clean4: inc     column          ; if non-control character,
                                ; increment column counter

clean5: call    putc            ; write char to stdout
        jnc     clean2          ; if disk not full,
                                ; get another character

clean10:                        ; write failed (reached from
                                ; every checked write below)...
        mov     dx,offset msg2  ; DS:DX = error message
        mov     cx,msg2_len     ; CX = message length
        mov     bx,stderr       ; BX = standard error handle
        mov     ah,40h          ; function 40h = write
        int     21h             ; transfer to MS-DOS

        mov     ax,4c01h        ; function 4ch = terminate
                                ; return code = 1
        int     21h             ; transfer to MS-DOS

clean6: mov     ax,column       ; tab code detected
        xor     dx,dx           ; DX:AX = column, unsigned, so
                                ; columns above 32767 still give
                                ; a remainder in 0..tabsiz-1
        mov     cx,tabsiz       ; tabsiz - (column MOD tabsiz)
        div     cx              ; is number of spaces needed
        sub     cx,dx           ; to move to next tab stop
        add     column,cx       ; also update column counter

clean7: push    cx              ; save spaces counter (putc may
                                ; reload CX when it writes)
        mov     al,blank        ; write an ASCII space
        call    putc
        pop     cx              ; restore spaces counter; POP
                                ; leaves putc's carry intact
        jc      clean10         ; write failed, report it
        loop    clean7          ; loop until tab stop
        jmp     clean2          ; get another character

clean8: call    putc            ; write EOF mark; as in the C
                                ; version of this filter, a
                                ; failure on the mark itself
        call    flush           ; (and on the buffer that holds
        jmp     short clean11   ; it) is not treated as an error

clean9: call    flush           ; write last output buffer
                                ; flush returns with CX = number
                                ; of bytes it tried to write and,
                                ; if CX <> 0, AX and carry as set
                                ; by function 40h
        jcxz    clean11         ; buffer was empty, nothing to check
        jc      clean10         ; jump if write failed
        cmp     ax,cx           ; was write complete?
        jne     clean10         ; jump if disk full

clean11:
        mov     ax,4c00h        ; function 4ch = terminate
                                ; return code = 0
        int     21h             ; transfer to MS-DOS

clean   endp
getc    proc    near
;
; Fetch the next character of standard input.
;
; Returns:  carry clear, AL = character,  or
;           carry set if the input is exhausted or the read failed.
; Uses:     iptr (next unread offset in ibuff) and ilen (number of
;           valid bytes in ibuff); refills ibuff when they are equal.
;
        mov     bx,iptr         ; BX = offset of next unread byte
        cmp     bx,ilen         ; anything left in the buffer?
        je      getc_fill       ; no, go read another block

getc_take:
        mov     al,[ibuff+bx]   ; AL = next character
        inc     bx              ; step past it
        mov     iptr,bx         ; and remember the new position
        clc                     ; carry = 0: AL is valid
        ret

getc_fill:                      ; buffer used up, refill it
        mov     bx,stdin        ; BX = standard input handle
        mov     cx,bufsiz       ; CX = length to read
        mov     dx,offset ibuff ; DS:DX = buffer address
        mov     ah,3fh          ; function 3fh = read
        int     21h             ; transfer to MS-DOS
        jc      getc_end        ; read failed
        test    ax,ax           ; zero bytes returned?
        jz      getc_end        ; yes, end of input
        mov     ilen,ax         ; ilen = bytes now in buffer
        xor     bx,bx           ; start again at offset 0
        jmp     getc_take

getc_end:
        stc                     ; carry = 1: no more input
        ret

getc    endp
putc    proc    near
;
; Append the character in AL to the output buffer, writing the
; buffer to standard output once it holds bufsiz bytes.
;
; Returns:  carry clear on success,
;           carry set if the write failed or was incomplete
;           (optr is left unchanged in that case).
;
        mov     bx,optr         ; BX = next free offset in obuff
        mov     [obuff+bx],al   ; deposit the character
        inc     bx              ; step past it
        cmp     bx,bufsiz       ; buffer full?
        je      putc_write      ; yes, go empty it

putc_save:
        mov     optr,bx         ; remember the new position
        clc                     ; carry = 0: success
        ret

putc_write:                     ; write the whole buffer
        mov     ah,40h          ; function 40h = write
        mov     bx,stdout       ; BX = standard output handle
        mov     cx,bufsiz       ; CX = length to write
        mov     dx,offset obuff ; DS:DX = buffer address
        int     21h             ; transfer to MS-DOS
        jc      putc_fail       ; write failed
        cmp     ax,cx           ; all bytes accepted?
        jne     putc_fail       ; no, disk full
        xor     bx,bx           ; buffer is empty again
        jmp     putc_save

putc_fail:
        stc                     ; carry = 1: write failed
        ret                     ; or disk full

putc    endp
init    proc    near
;
; Prime the input buffer: read up to bufsiz bytes from standard
; input into ibuff and record the count in ilen.  If the read
; fails, ilen is left as it was.
;
        mov     ah,3fh          ; function 3fh = read
        mov     dx,offset ibuff ; DS:DX = buffer address
        mov     cx,bufsiz       ; CX = length to read
        mov     bx,stdin        ; BX = standard input handle
        int     21h             ; transfer to MS-DOS
        jc      init_done       ; read failed, keep old ilen
        mov     ilen,ax         ; ilen = bytes actually read
init_done:
        ret

init    endp
flush   proc    near
;
; Write whatever is waiting in the output buffer (optr bytes of
; obuff) to standard output.  Does nothing when the buffer is
; empty.  optr itself is not modified.
;
        mov     cx,optr         ; CX = number of pending bytes
        jcxz    flush_done      ; none, nothing to write
        mov     ah,40h          ; function 40h = write
        mov     bx,stdout       ; BX = standard output handle
        mov     dx,offset obuff ; DS:DX = buffer address
        int     21h             ; transfer to MS-DOS
flush_done:
        ret

flush   endp
_TEXT ends
_DATA   segment word public 'DATA'

; ---- I/O buffers ----------------------------------------------------
ibuff   db      bufsiz dup (0)  ; data read from standard input
obuff   db      bufsiz dup (0)  ; data waiting for standard output

; ---- buffer bookkeeping ---------------------------------------------
iptr    dw      0               ; offset of next unread byte in ibuff
ilen    dw      0               ; number of valid bytes in ibuff
optr    dw      0               ; offset of next free byte in obuff

column  dw      0               ; current output column

; ---- messages -------------------------------------------------------
                                ; '$'-terminated, shown with
                                ; function 9 under MS-DOS 1.x
msg1    db      cr,lf,'clean: need MS-DOS version 2 or greater.',cr,lf,'$'

                                ; written to stderr with function
                                ; 40h, so it carries a length
msg2    db      cr,lf,'clean: disk is full.',cr,lf
msg2_len equ    $-msg2

_DATA   ends
STACK   segment para stack 'STACK'
                                ; 128 bytes (64 words) of stack
        db      128 dup (?)
STACK   ends
end clean
Figure 15-3. CLEAN.ASM, the source code for the MASM version of the CLEAN filter.
/*
CLEAN.C Filter to turn document files into
normal text files.
Copyright 1988 Ray Duncan
Compile: C>CL CLEAN.C
Usage: C>CLEAN <infile >outfile
All text characters are passed through with high bit stripped
off. Formfeeds, carriage returns, and linefeeds are passed
through. Tabs are expanded to spaces. All other control codes
are discarded.
*/
#include <stdio.h>
#include <stdlib.h>
#define TAB_WIDTH 8             /* columns between tab stops   */

/* ASCII codes handled specially by the filter */
#define TAB       '\t'          /* horizontal tab    (0x09)    */
#define LF        '\n'          /* linefeed          (0x0A)    */
#define FF        '\f'          /* formfeed          (0x0C)    */
#define CR        '\r'          /* carriage return   (0x0D)    */
#define BLANK     ' '           /* space             (0x20)    */
#define EOFMK     '\032'        /* Ctrl-Z end of file (0x1A)   */
main(int argc, char *argv[])
{
char c; /* character from stdin */
int col = 0; /* column counter */
while((c = getchar()) != EOF) /* read input character */
{
c &= 0x07F; /* strip high bit */
switchÓ /* decode character */
{
case LF: /* if linefeed or */
case CR: /* carriage return, */
col=0; /* reset column count */
case FF: /* if formfeed, carriage */
wcharÓ; /* return, or linefeed, */
break; /* pass character through */
case TAB: /* if tab, expand to spaces*/
do wchar(BLANK);
while((++col % TAB_WIDTH) != 0);
break;
default: /* discard other control */
if(c >= BLANK) /* characters, pass text */
{ /* characters through */
wcharÓ;
col++; /* bump column counter */
}
break;
}
}
wchar(EOFMK); /* write end-of-file mark */
exit(0);
}
/*
    Write a character to the standard output.  If the write
    fails, display an error message on stderr and terminate
    with return code 1.  A failure while writing the
    end-of-file mark (EOFMK) itself is ignored.

    Returns 0 whenever it returns at all; callers may ignore
    the value.
*/
int wchar(char c)
{
    if((putchar(c) == EOF) && (c != EOFMK))
    {
        fputs("clean: disk full",stderr);
        exit(1);
    }
    return 0;
}
Figure 15-4. CLEAN.C, the source code for the C version of the CLEAN filter.
When using the CLEAN filter, you must specify the source and destination files with redirection parameters in the command line; otherwise, CLEAN will simply read the keyboard and write to the display. For example, to filter the document file MYFILE.DOC and leave the result in the file MYFILE.TXT, you would enter the following command:
C>CLEAN <MYFILE.DOC >MYFILE.TXT <Enter>
(Note that the original file, MYFILE.DOC, is unchanged.)
One valuable application of this filter is to rescue assembly-language source files. If you accidentally edit such a source file in document mode, the resulting file may cause the assembler to generate spurious or confusing error messages. CLEAN lets you turn the source file back into something the assembler can cope with, without losing the time you spent to edit it.
Another handy application for CLEAN is to list a word-processed document in raw form on the printer, using a command such as
C>CLEAN <MYFILE.DOC >PRN <Enter>
Contrasting the C and assembly-language versions of this filter provides some interesting statistics. The C version contains 79 lines and compiles to a 5889-byte .EXE file, whereas the assembly-language version contains 265 lines and builds an 1107-byte .EXE file. The size and execution-speed advantages of implementing such tools in assembly language are obvious, even compared with such an excellent compiler as the Microsoft C Optimizing Compiler. However, you must balance performance considerations against the time and expense required for programming, particularly when a program will not be used very often.