An HTML to Text File Converter in Batch

This is a batch file that cleanly converts Web documents into plain text files. If the source document is in UNIX format, you might have to convert it to DOS format first, by loading it into EDIT and saving it (or by other means). The batch does not interpret codes like <br> or <p>; it goes only by the line breaks in the source, so a document where everything is on one line won't convert very well. The WORDWRAP batch can help if the file comes out with long lines.

QBasic must be on the path; it is supplied with MS-DOS versions 5 and above. To use, simply enter:

   html2 infile.htm outfile.txt

If you want to preserve embedded http/ftp hyperlinks then use:

   html2 infile.htm outfile.txt /link

(substitute the actual file names, they don't have to be .htm and .txt)

HTML2.BAT removes all HTML tags that begin with a letter, ! or / then it converts &lt; &gt; &#60; &#62; &nbsp; &quot; &#38; &amp; and &middot; codes to their proper characters, hopefully resulting in readable text or working batch code. It gets a little flaky when mixing &#.. codes and other & codes when right next to each other, but I think I can live with that.

This code has been HTML-converted for proper display. To recover the code, copy/paste from the browser screen or get the UTILBATS collection.

 
:: HTML2.BAT - HTML to text converter (c)1996,1997 Terry Newton
:: Syntax:  html2 infile.htm outfile.ext [/link]
::  /link option leaves hypertext urls in text
@echo off
:: Both file names are required; show the syntax rather than exiting silently.
if not %2.==. goto chkin
echo Syntax:  html2 infile.htm outfile.ext [/link]
goto done
:chkin
if not exist %1 echo Input file not found
if not exist %1 goto done
:: The writability probe below overwrites the output file, so refuse to run
:: when both names are the same or the input would be destroyed.
:: This is a plain text comparison: it is case-sensitive and does not
:: recognize two different spellings of the same path.
if not %1.==%2. goto probe
echo Input and output files must be different
goto done
:probe
:: Probe: write an empty line to the output name, confirm that the file now
:: exists, then remove it again before the real conversion starts.
echo.>%2
if not exist %2 echo Cannot create output file
if not exist %2 goto done
del %2
echo Working...
:: create temporary qbasic program...
:: Every echo below adds one line of QBasic source to proc$.bas. The first
:: one uses a single redirect, so an old proc$.bas is overwritten.
:: Any QBasic runtime error jumps to the closeout label emitted further down.
:: NOTE(review): presumably that is also how end of input stops the loop,
:: since floop never tests for end of file itself - confirm.
:: NOTE(review): the colon in front of "on" looks like a guard so that ECHO
:: is not handed the word ON as its first argument - confirm.
echo> proc$.bas :on error goto closeout
:: file #1 is the first argument (input), file #2 is the second (output)
echo>>proc$.bas open "%1" for input as #1
echo>>proc$.bas open "%2" for output as #2
::======== main loop - get input line ======
:: a$ holds the current line for all of the processing that follows
echo>>proc$.bas floop:line input #1,a$
::======== html code stripper ==============
:: q = position in a$ where the next search for "<" starts
echo>>proc$.bas q=1
:: find the next "<"; when there is none the line is finished
echo>>proc$.bas qloop:p=instr(q,a$,"<"):if p=0 goto qend
:: q now points at the character after "<"; "/" and "!" always mean a tag
echo>>proc$.bas q=p+1:if mid$(a$,q,1)="/" goto qconv
echo>>proc$.bas if mid$(a$,q,1)="!" goto qconv
:: upper and lower case forms are equal when the character is not a letter,
:: so this "<" is not treated as a tag; leave it and keep scanning after it
echo>>proc$.bas if ucase$(mid$(a$,q,1))=lcase$(mid$(a$,q,1)) goto qloop
:: b$ = text in front of the tag
echo>>proc$.bas qconv:if p=1 then b$="" else b$=left$(a$,p-1)
:: find the closing ">"; if the tag is not closed on this line, the rest of
:: the line is left as it is
echo>>proc$.bas p=instr(q,a$,">"):if p=0 goto qend
:: c$ = text behind the ">"
echo>>proc$.bas le=len(a$):if p=le then c$="" else c$=right$(a$,le-p)
:: the next four QBasic lines are emitted only when the third argument is
:: /link or /LINK
if not %3.==/link. if not %3.==/LINK. goto nolink
:: d$ = contents of the tag, e$ = lower case copy used for searching
echo>>proc$.bas d$=mid$(a$,q,p-q):e$=lcase$(d$):r1=0
:: r = start of "http://" inside the tag, or of "ftp://" if there is no http
echo>>proc$.bas r=instr(e$,"http://"):if r=0 then r=instr(e$,"ftp://")
:: r1 = position of the first double quote at or after the url start
echo>>proc$.bas if not(r=0) then r1=instr(r,e$,chr$(34))
:: when both were found, put the url in parentheses in front of the text
:: that follows the tag
echo>>proc$.bas if not(r1=0) then c$="("+mid$(d$,r,r1-r)+") "+c$
:nolink
:: drop the tag and resume the search at the position where it began
echo>>proc$.bas a$=b$+c$:q=q-1:goto qloop
echo>>proc$.bas qend:
::======== entity decoding ==================
:: Stages a to g each replace every occurrence of one entity (matched without
:: regard to case) by its character. For each match: b$ = text in front of
:: the entity, c$ = text behind it, and le = line length minus (entity length
:: minus 1), so p=le means the entity is the last thing on the line.
:: The ampersand entities are decoded last, in one left to right pass, so an
:: ampersand that was just produced is never read again as the start of
:: another entity ("&amp;middot;" gives the text "&middot;", not a dot).
::======== convert "&lt;" to "<" =========
echo>>proc$.bas a:p=instr(lcase$(a$),"&lt;"):if p=0 goto b
echo>>proc$.bas if p=1 then b$="" else b$=left$(a$,p-1)
echo>>proc$.bas le=len(a$)-3:if p=le then c$="" else c$=right$(a$,le-p)
echo>>proc$.bas a$=b$+"<"+c$:goto a
::======== convert "&gt;" to ">" =========
echo>>proc$.bas b:p=instr(lcase$(a$),"&gt;"):if p=0 goto c
echo>>proc$.bas if p=1 then b$="" else b$=left$(a$,p-1)
echo>>proc$.bas le=len(a$)-3:if p=le then c$="" else c$=right$(a$,le-p)
echo>>proc$.bas a$=b$+">"+c$:goto b
::======== convert "&#60;" to "<" =========
echo>>proc$.bas c:p=instr(lcase$(a$),"&#60;"):if p=0 goto d
echo>>proc$.bas if p=1 then b$="" else b$=left$(a$,p-1)
echo>>proc$.bas le=len(a$)-4:if p=le then c$="" else c$=right$(a$,le-p)
echo>>proc$.bas a$=b$+"<"+c$:goto c
::======== convert "&#62;" to ">" =========
echo>>proc$.bas d:p=instr(lcase$(a$),"&#62;"):if p=0 goto e
echo>>proc$.bas if p=1 then b$="" else b$=left$(a$,p-1)
echo>>proc$.bas le=len(a$)-4:if p=le then c$="" else c$=right$(a$,le-p)
echo>>proc$.bas a$=b$+">"+c$:goto d
::======== convert "&nbsp;" to " " =========
echo>>proc$.bas e:p=instr(lcase$(a$),"&nbsp;"):if p=0 goto f
echo>>proc$.bas if p=1 then b$="" else b$=left$(a$,p-1)
echo>>proc$.bas le=len(a$)-5:if p=le then c$="" else c$=right$(a$,le-p)
echo>>proc$.bas a$=b$+" "+c$:goto e
::======== convert "&quot;" to " ===========
echo>>proc$.bas f:p=instr(lcase$(a$),"&quot;"):if p=0 goto g
echo>>proc$.bas if p=1 then b$="" else b$=left$(a$,p-1)
echo>>proc$.bas le=len(a$)-5:if p=le then c$="" else c$=right$(a$,le-p)
echo>>proc$.bas a$=b$+chr$(34)+c$:goto f
::======== convert "&middot;" to "·" =========
:: done before the ampersand stage so that only a real "&middot;" in the
:: source is turned into a dot
echo>>proc$.bas g:p=instr(lcase$(a$),"&middot;"):if p=0 goto h
echo>>proc$.bas if p=1 then b$="" else b$=left$(a$,p-1)
echo>>proc$.bas le=len(a$)-7:if p=le then c$="" else c$=right$(a$,le-p)
echo>>proc$.bas a$=b$+"·"+c$:goto g
::======== convert "&#38;" and "&amp;" to "&" =========
:: k = position where the next search for an ampersand starts
echo>>proc$.bas h:k=1
echo>>proc$.bas i:p=instr(k,a$,"&"):if p=0 goto wrline
:: t$ = the ampersand plus the four characters behind it, in lower case;
:: the search resumes behind this ampersand whether or not it is replaced
echo>>proc$.bas k=p+1:t$=lcase$(mid$(a$,p,5))
:: both entities are five characters long: keep the ampersand itself and
:: cut out the four characters that follow it
echo>>proc$.bas if t$="&#38;" or t$="&amp;" then a$=left$(a$,p)+mid$(a$,p+5)
echo>>proc$.bas goto i
::======== emit the finished line, then fetch the next one ====
echo>>proc$.bas wrline:print #2,a$
echo>>proc$.bas goto floop
::======== error handler - close both files and leave qbasic ====
echo>>proc$.bas closeout:close #1
echo>>proc$.bas close #2
echo>>proc$.bas system
:: run the generated program, then throw it away
qbasic /run proc$.bas
del proc$.bas
:: report the result according to whether the output file exists
if not exist %2 goto failed
echo %2 text file created
goto done
:failed
echo Something didn't work...
:done