type  extractor

pdf   PDF::OCR
doc   antiword
html  html2txt