Saturday, 18 June 2016

PROC SPELL: The Spell of SAS, Part 1

In the first example below, Spelt is a fileref for the file containing the text to be spell-checked, and Words is a libref pointing to the SAS library where the spell checker's word list is located. The SUGGEST option lists spelling suggestions from the dictionary.
Verify is the default and is assumed unless a custom catalog is being created or updated with Proc SPELL.
To make the spell checker more useful, you can create a custom dictionary of industry-relevant words that are not in the SAS dictionary. This dictionary is stored as a SAS catalog entry and is created with PROC SPELL. To create a dictionary:
Create a text file containing the uppercase words that you want to define.
Put each word on a separate line.
Point to the SAS file that holds the dictionary catalog (if updating) or create a new catalog.
Point to the location of the custom word list.

/* Spelt : fileref of the text file to be spell-checked.            */
/* Words : libref of the library that holds the dictionary catalog. */
filename spelt '<file-specification>';

/* VERIFY checks the text against the dictionary; SUGGEST also */
/* lists spelling suggestions for each unrecognized word.      */
proc spell
    in=spelt
    dictionary=words.spell.mywords
    verify
    suggest;
run;
/***********************************************/
/* Spell-check c:\text.txt using only the VERIFY option. */
filename textdata 'c:\text.txt';

proc spell in=textdata verify;
run;
/***********************************************/
The code below creates the custom word catalog. The newly created catalog contains the entries already in the SAS dictionary plus the custom words that were added. To add words to an existing custom catalog, change CREATE to UPDATE.

/* Build a custom dictionary catalog entry from a word list file.      */
/* WORDS= names the text file of custom words (one per line), CREATE   */
/* builds a new dictionary, and DICT= names the catalog entry written. */
/* The path now has the backslash after the drive letter: the original */
/* "e:sasfolder\..." was relative to the current directory on drive E. */
/* NOTE(review): WORK is normally a temporary library, so this catalog */
/* presumably disappears at the end of the session -- use a permanent  */
/* libref if the dictionary should persist.                            */
PROC Spell words  = "e:\sasfolder\condition.txt"
           create
             dict = work.mycatalog.spell;
run;
/*********************************************/
/*** Using PROC SPELL, find every combination of letters that   ***/
/*** forms a valid word of 3 or 4 letters from a phone number.  ***/
/*** This step only enumerates the candidate strings: each      ***/
/*** digit is expanded to the letters on its keypad key and the ***/
/*** ten digits are laid out as three groups (3, 3 and 4).      ***/

/*** 10-digit phone number; blanks between the digits act as delimiters ***/
%let myphone = 8 6 0 6 7 3 9 2 7 8;

data A (keep = L);
  /*** candidate string: 3 letters, blank, 3 letters, blank, 4 letters ***/
  length L $12.;

  /*** Letters(d, k): k-th letter on keypad digit d, blank when absent ***/
  array Letters (0:9, 4) $1. _temporary_
        (' ' ' ' ' ' ' '      /* 0 */
         ' ' ' ' ' ' ' '      /* 1 */
         'A' 'B' 'C' ' '      /* 2 */
         'D' 'E' 'F' ' '      /* 3 */
         'G' 'H' 'I' ' '      /* 4 */
         'J' 'K' 'L' ' '      /* 5 */
         'M' 'N' 'O' ' '      /* 6 */
         'P' 'Q' 'R' 'S'      /* 7 */
         'T' 'U' 'V' ' '      /* 8 */
         'W' 'X' 'Y' 'Z');    /* 9 */

  /*** NLetters(d): how many entries of Letters(d, *) are iterated ***/
  /***                                 0 1 2 3 4 5 6 7 8 9         ***/
  array NLetters (0:9) _temporary_    (1 1 3 3 3 3 3 4 3 4);

  /*** D0-D9 hold the ten digits of the phone number, in order ***/
  array Ph (0:9) D0 - D9 (&myphone);

  /*** first group: digits 0-2 fill columns 1-3 ***/
  do K0 = 1 to NLetters(D0);
    substr(L, 1, 1) = Letters(D0, K0);
    do K1 = 1 to NLetters(D1);
      substr(L, 2, 1) = Letters(D1, K1);
      do K2 = 1 to NLetters(D2);
        substr(L, 3, 1) = Letters(D2, K2);

        /*** column 4 is never written and stays blank ***/
        /*** second group: digits 3-5 fill columns 5-7 ***/
        do K3 = 1 to NLetters(D3);
          substr(L, 5, 1) = Letters(D3, K3);
          do K4 = 1 to NLetters(D4);
            substr(L, 6, 1) = Letters(D4, K4);
            do K5 = 1 to NLetters(D5);
              substr(L, 7, 1) = Letters(D5, K5);

              /*** column 8 is never written and stays blank ***/
              /*** third group: digits 6-9 fill columns 9-12 ***/
              do K6 = 1 to NLetters(D6);
                substr(L, 9, 1) = Letters(D6, K6);
                do K7 = 1 to NLetters(D7);
                  substr(L, 10, 1) = Letters(D7, K7);
                  do K8 = 1 to NLetters(D8);
                    substr(L, 11, 1) = Letters(D8, K8);
                    do K9 = 1 to NLetters(D9);
                      substr(L, 12, 1) = Letters(D9, K9);
                      output;   /*** one row per full combination ***/
                    end;
                  end;
                end;
              end;

            end;
          end;
        end;

      end;
    end;
  end;
run;

 /*** Write every candidate string from data set A to a text ***/
 /*** file so that PROC SPELL can check it.                  ***/
 data _null_;
   file "testprocspell.txt";
   set a;
   put L;
 run;

 /*** Route procedure output to badwords.lis (NEW replaces any ***/
 /*** existing file), run the spell check, then reset routing. ***/
 proc printto print="badwords.lis" new;
 run;

 proc spell wordlist="testprocspell.txt";
 run;

 proc printto;
 run;

  /*** Read the PROC SPELL listing captured in badwords.lis,  ***/
  /*** keeping the first 30 columns of each non-blank line.   ***/
  data badwords;
   /*** TRUNCOVER instead of MISSOVER: with formatted input,   ***/
   /*** MISSOVER sets the variable to missing whenever the     ***/
   /*** record is shorter than the 30-column informat, so      ***/
   /*** short listing lines would be silently dropped by the   ***/
   /*** blank test below. TRUNCOVER keeps the text present.    ***/
   infile 'badwords.lis'      truncover ;
   input @1 badwords $ 30. ;
   if badwords = ' ' then delete;
   badwords = left(badwords);
   /*** drop lines containing 'File:' or 'Unrecognized' --     ***/
   /*** presumably the listing's header lines; confirm against ***/
   /*** an actual badwords.lis                                 ***/
   if 0 ne index(badwords, 'File:') then delete;
   if 0 ne index(badwords, 'Unrecognized') then delete;
  run;

  /*** keep one row per distinct value ***/
  proc sort data=badwords out=badwords nodupkey; by badwords; run;

  /*** Build the cleanup schema: one row per bad word, with the ***/
  /*** word as TARGET and a blank as its REPLACE value.         ***/
  data clnbadwd (drop = badwords);
    length target replace $ 50;
    set badwords;
    target  = badwords;
    replace = ' ';
  run;

  /*** Using TIP 00128a (CLEANSEM.sas); this script includes a    ***/
  /*** local copy named cleanse_my.sas.                           ***/
  /*** NOTE(review): %chkschem and %cleanse are presumably        ***/
  /*** defined in that file -- it is not shown here; confirm.     ***/
  %include 'cleanse_my.sas'; /*** my temporary version ***/

  /*** Each schema needs to be checked for cyclical mappings.     ***/
  /*** Per the original note, this check also creates a global    ***/
  /*** macro variable holding the number of records for each      ***/
  /*** schema used.                                               ***/
  /*** CHECK OUTPUT for any cyclical mapping reports.             ***/

  %chkschem( schema=clnbadwd);

 /*** Run the cleanse macro over every candidate string using  ***/
 /*** the clnbadwd schema (presumably it applies the schema's  ***/
 /*** target/replace mappings to L -- the macro is defined     ***/
 /*** outside this file), then drop rows left entirely blank.  ***/
 data clean;
   set a;
   %cleanse(schema=clnbadwd, var=L);
   if missing(L) then delete;
 run;

 /*** one row per distinct remaining string ***/
 proc sort data=clean nodupkey;
   by L;
 run;

 /*** write the remaining strings to cleanwords.txt ***/
 data _null_;
   set work.clean;
   file "cleanwords.txt";
   put L;
 run;

/*** output from Clean Words Recognized by PROC SPELL 
     default Dictionary                                 ***/
ORE
ORE PART
ORE PAST
ORE WART
PART
PAST
TO
TO    PART
TO    PAST
TO    WART
TO  ORE
TO  ORE PART
TO  ORE PAST
TO  ORE WART
WART

/*** end of program ***/

Reference for PROC Spell: http://analytics.ncsu.edu/sesug/2007/SD06.pdf

The practice part will be continued in the next blog post.

No comments:

Post a Comment