In the code below, Spelt is a fileref for the file containing the text to be spell-checked, and Words is a libref pointing to the SAS library where the spell-checker dictionary is located. The SUGGEST option lists spelling suggestions from the dictionary.
Verify is the default and is assumed unless a custom catalog is being created or updated with Proc SPELL.
To be more useful, a custom dictionary of industry-relevant words that are not in SAS's dictionary can be created. This dictionary is a SAS catalog entry and is created with the SPELL procedure. To create a dictionary:
Create a text file with the upper case words that you want to define.
Put each word on a separate line.
Point to the SAS file that holds the dictionary catalog (if updating) or create a new catalog.
Point to the location of the custom word list.
/* Spell-check the text behind fileref Spelt against a custom dictionary. */
filename Spelt '<file-specification>';

proc spell
    in         = Spelt                 /* fileref of the text to check            */
    dictionary = Words.spell.mywords   /* dictionary catalog entry to check with  */
    verify                             /* stated explicitly here                  */
    suggest                            /* list spelling suggestions               */
;
run;
//***********************************************
/* Spell-check c:\text.txt; no DICTIONARY= option is given in this call. */
filename textdata 'c:\text.txt';

proc spell in=textdata verify;
run;
//***********************************************
The code below creates the custom word catalog. The newly created catalog includes the entries in the SAS catalog in addition to the added custom words. To add words to an existing custom catalog, change CREATE to UPDATE.
/* Build a new custom dictionary catalog entry from a word-list file.      */
/* Change CREATE to UPDATE to add words to an existing custom catalog.     */
/* The path is fully qualified (drive letter followed by a backslash) so   */
/* it does not depend on the current directory of drive e:.                */
proc spell
    wordlist   = "e:\sasfolder\condition.txt"   /* one word per line */
    create
    dictionary = work.mycatalog.spell
;
run;
//*********************************************
/*** Generate every letter string a phone number can spell on a   ***/
/*** telephone keypad, laid out as 3 + 3 + 4 letters separated by ***/
/*** blanks, so PROC SPELL can later pick out the valid words.    ***/
%let myphone = 8 6 0 6 7 3 9 2 7 8; /*** 10 digit phone number ***/
/*** with spaces between each number as delimiters ***/
data A (keep = L);
    /* Keypad letters: first index = digit 0-9, second index = letter slot 1-4. */
    array C (0:9, 4) $1. _temporary_
        (' ' ' ' ' ' ' '    /* 0 : no letters */
         ' ' ' ' ' ' ' '    /* 1 : no letters */
         'A' 'B' 'C' ' '    /* 2 */
         'D' 'E' 'F' ' '    /* 3 */
         'G' 'H' 'I' ' '    /* 4 */
         'J' 'K' 'L' ' '    /* 5 */
         'M' 'N' 'O' ' '    /* 6 */
         'P' 'Q' 'R' 'S'    /* 7 */
         'T' 'U' 'V' ' '    /* 8 */
         'W' 'X' 'Y' 'Z');  /* 9 */

    /* How many letter slots to loop over for each digit 0-9;      */
    /* 0 and 1 get one pass so that a single blank is emitted.     */
    array T (0:9) _temporary_ (1 1 3 3 3 3 3 4 3 4);

    /* The ten digits of the phone number, P0 (first) to P9 (last). */
    array P (0:9) P0 - P9 (&myphone);

    /* Columns 4 and 8 of L are never assigned and stay blank, which */
    /* splits the string into 3-, 3- and 4-letter groups.            */
    length L $12.;

    do i0 = 1 to T(P0);
      substr(L, 1, 1) = C(P0, i0);
      do i1 = 1 to T(P1);
        substr(L, 2, 1) = C(P1, i1);
        do i2 = 1 to T(P2);
          substr(L, 3, 1) = C(P2, i2);
          do i3 = 1 to T(P3);
            substr(L, 5, 1) = C(P3, i3);
            do i4 = 1 to T(P4);
              substr(L, 6, 1) = C(P4, i4);
              do i5 = 1 to T(P5);
                substr(L, 7, 1) = C(P5, i5);
                do i6 = 1 to T(P6);
                  substr(L, 9, 1) = C(P6, i6);
                  do i7 = 1 to T(P7);
                    substr(L, 10, 1) = C(P7, i7);
                    do i8 = 1 to T(P8);
                      substr(L, 11, 1) = C(P8, i8);
                      do i9 = 1 to T(P9);
                        substr(L, 12, 1) = C(P9, i9);
                        output;   /* one row per complete combination */
                      end;
                    end;
                  end;
                end;
              end;
            end;
          end;
        end;
      end;
    end;
run;
/*** Write every candidate string in A to a text file, one per line, ***/
/*** so it can be handed to PROC SPELL as a word list.               ***/
data _null_;
    file "testprocspell.txt";
    set a;
    put L;
run;

/*** Route procedure output to badwords.lis (NEW replaces any old file) ***/
proc printto print="badwords.lis" new;
run;

proc spell wordlist="testprocspell.txt";
run;

/*** Reset PROC PRINTTO so later output goes to the default destination ***/
proc printto;
run;
/*** Read the words reported by PROC SPELL from the captured listing.    ***/
/*** TRUNCOVER is used instead of MISSOVER: with the fixed-width $30.    ***/
/*** informat, MISSOVER sets the variable to missing whenever a line is  ***/
/*** shorter than 30 characters, so short lines would be thrown away by  ***/
/*** the blank check below. TRUNCOVER keeps whatever text is present.    ***/
data badwords;
    infile 'badwords.lis' truncover;
    input @1 badwords $30.;
    if badwords = ' ' then delete;                         /* skip blank lines          */
    badwords = left(badwords);                             /* left-align the text       */
    if index(badwords, 'File:') ne 0 then delete;          /* drop lines with 'File:'   */
    if index(badwords, 'Unrecognized') ne 0 then delete;   /* drop 'Unrecognized' lines */
run;

/*** Keep one row per distinct bad word ***/
proc sort data=badwords out=badwords nodupkey;
    by badwords;
run;
/*** Build the cleanup table: each bad word becomes a TARGET whose ***/
/*** REPLACE value is blank.                                       ***/
data clnbadwd (drop=badwords);
    length target replace $ 50;
    set badwords;
    target  = badwords;
    replace = ' ';
run;
/*** Using TIP 00128a (CLEANSEM.sas) ***/
/*** NOTE(review): %chkschem and %cleanse are not defined in this listing; ***/
/*** presumably they come from the included file - confirm.               ***/
%include 'cleanse_my.sas'; /*** my temporary version ***/
/*** need to check each schema for cyclical mappings ***/
/*** as well as create a global macro variable to get ***/
/*** number of records for each schema used ***/
/*** CHECK OUTPUT for any cyclical mapping reports ***/
%chkschem( schema=clnbadwd);
/*** Apply the clnbadwd schema to variable L of every row in A, then ***/
/*** keep only the rows whose L is not blank afterwards.             ***/
data clean;
set a;
%cleanse(schema=clnbadwd ,var=L );
if L ne ' '; /*** keep only non blank L ***/
run;
/*** Sort CLEAN in place by L, keeping one row per distinct value ***/
proc sort data=clean nodupkey;
    by L;
run;

/*** Write the remaining values of L, one per line, to cleanwords.txt ***/
data _null_;
    set work.clean;
    file "cleanwords.txt";
    put L;
run;
/*** output from Clean Words Recognized by PROC SPELL
default Dictionary ***/
ORE
ORE PART
ORE PAST
ORE WART
PART
PAST
TO
TO PART
TO PAST
TO WART
TO ORE
TO ORE PART
TO ORE PAST
TO ORE WART
WART
/*** end of program ***/
Reference for PROC Spell: http://analytics.ncsu.edu/sesug/2007/SD06.pdf
The practice part will be continued in the next blog post.
No comments:
Post a Comment