Title: SUGI 28, PAPER 4
1 SUGI 28, PAPER 4
2 Hand-Coded Direct-Addressing Routines Presented At:
SUGI 25, Paper 129 - Paul M. Dorfman, "Private Detectives in a Data Warehouse: Key-Indexing, Bitmapping, and Hashing"
SUGI 26, Paper 8 - Paul M. Dorfman, "Table Look-Up by Direct Addressing: Key-Indexing, Bitmapping, Hashing"
SUGI 26, Paper 128 - Paul M. Dorfman, "Quick Disk Table Look-up via Hybrid Indexing into a Directly Addressed SAS Data Set"
SUGI 27, Paper 12 - Paul M. Dorfman and Gregg P. Snell, "Hashing Rehashed"
3 SAS Version 9 DATA Step Object: Associative Array
SUGI 27 Presentation - Jason Secosky, "The DATA Step in Version 9: What's New?"
Associative Array
Hash
1
42
1
53
2
1
64
3
7
data match ;
   array hkey (0:999) _temporary_ ;
   do until ( eof1 ) ;
      set small end = eof1 ;
      if missing (hkey(key)) then hkey(key) = s_sat ;
   end ;
   do until ( eof2 ) ;
      set large end = eof2 ;
      s_sat = hkey(key) ;
      if s_sat gt . then output ;
   end ;
   stop ;
run ;
00
2
8
data match ;
   array hkey (0:999) _temporary_ ;
   do until ( eof1 ) ;
      set small end = eof1 ;
      if missing (hkey(key)) then hkey(key) = s_sat ;
   end ;
   do until ( eof2 ) ;
      set large end = eof2 ;
      s_sat = hkey(key) ;
      if s_sat gt . then output ;
   end ;
   stop ;
run ;
00
01
2
9
data match ;
   array hkey (0:999) _temporary_ ;
   do until ( eof1 ) ;
      set small end = eof1 ;
      if missing (hkey(key)) then hkey(key) = s_sat ;
   end ;
   do until ( eof2 ) ;
      set large end = eof2 ;
      s_sat = hkey(key) ;
      if s_sat gt . then output ;
   end ;
   stop ;
run ;
00
02
01
2
10
data match ;
   array hkey (0:999) _temporary_ ;
   do until ( eof1 ) ;
      set small end = eof1 ;
      if missing (hkey(key)) then hkey(key) = s_sat ;
   end ;
   do until ( eof2 ) ;
      set large end = eof2 ;
      s_sat = hkey(key) ;
      if s_sat gt . then output ;
   end ;
   stop ;
run ;
00
02
01
2
11
data match ;
   array hkey (0:999) _temporary_ ;
   do until ( eof1 ) ;
      set small end = eof1 ;
      if missing (hkey(key)) then hkey(key) = s_sat ;
   end ;
   do until ( eof2 ) ;
      set large end = eof2 ;
      s_sat = hkey(key) ;
      if s_sat gt . then output ;
   end ;
   stop ;
run ;
00
02
01
1
12LIMITATIONS
(9-Digit SSN Would Require 8 GB)
So you may be asking yourself...
How can one possibly use key-indexing when SSN
(or any other large or non-integer value) is the
key?
3
2
1
131
2
141
2
15GENERATIONS
Our Mission
data small ;
   input key s_sat ;
   cards ;
185 00
971 11
400 22
260 33
922 44
970 55
543 66
532 77
050 88
067 99
;
run ;
(integer variable KEY)
(satellite variable S_SAT)
To unsorted LARGE
Assume
Which generation of hashing is the most efficient
for subsetting LARGE based on the values of KEY
in SMALL to produce a file MATCH?
7
6
5
4
3
2
1
16(Linear Probing)
h = mod(key, hsize)
15
%let load = 0.625 ;
02
09
05
data _null_ ;
   do p = ceil (p / &load) by 1 until (j = up + 1) ;
      up = ceil (sqrt (p)) ;
      do j = 2 to up until (not mod (p, j)) ;
      end ;
   end ;
   call symput ('hsize', left (put (p, best.))) ;
   stop ;
   set small nobs = p ;
run ;
%put hsize=&hsize ;
hash_size = 17
04
01
16
05
16
16
5
4
3
2
1
17GENERATION I
%let load = 0.625 ;
data _null_ ;
   do p = ceil (p / &load) by 1 until (j = up + 1) ;
      up = ceil (sqrt (p)) ;
      do j = 2 to up until (not mod (p, j)) ;
      end ;
   end ;
   call symput ('hsize', left (put (p, best.))) ;
   stop ;
   set small nobs = p ;
run ;
* 0 if dupes to be pulled ;
%let nodupes = 1 ;
data match (keep = key s_sat l_sat) ;
   retain nodupes &nodupes ;
   array hkey (0 : &hsize) _temporary_ ;
   array hsat (0 : &hsize) _temporary_ ;
   ...
13
10
12
11
18GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00).
(00). (01). (01). (02).
(02). (03). (03). (04).
(04). (05). (05). (06).
(06). (07). (07). (08).
(08). (09). (09). (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16).
(16). (17). (17).
* 0 if dupes to be pulled ;
%let nodupes = 1 ;
data match (keep = key s_sat l_sat) ;
   retain nodupes &nodupes ;
   array hkey (0 : &hsize) _temporary_ ;
   array hsat (0 : &hsize) _temporary_ ;
   do until ( eof1 ) ;
      set small end = eof1 ;
      do h = mod (key, &hsize) by 1 ;
         if h = &hsize then h = 0 ;
         if hkey(h) = key and nodupes then leave ;
         if hkey(h) = . then do ;
            hkey(h) = key ;
            hsat(h) = s_sat ;
            leave ;
         end ;
      end ;
   end ;
   ...
9
19GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00).
(00). (01). (01). (02).
(02). (03). (03). (04).
(04). (05). (05). (06).
(06). (07). (07). (08).
(08). (09). (09). (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15)185 (15)00 (16).
(16). (17). (17).
0 if dupes to be pulled let nodupes 1
data match
(keepkey s_sat l_sat)
retain nodupes
nodupes..
array hkey(0hsize)
_temporary_
array hsat(0hsize) _temporary_
do
until ( eof1 )
set small end
eof1
do hmod (key, hsize) by
1
if h hsize then h 0
if
hkey(h)key and nodupes then leave
if hkey(h)
. then do
hkey(h) key
hsat(h) s_sat
leave
end
end
end
...
9
20GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00).
(00). (01). (01). (02)971
(02)11 (03). (03). (04).
(04). (05). (05). (06).
(06). (07). (07). (08).
(08). (09). (09). (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16).
(16). (17). (17).
0 if dupes to be pulled let nodupes 1
data match
(keepkey s_sat l_sat)
retain nodupes
nodupes..
array hkey(0hsize)
_temporary_
array hsat(0hsize) _temporary_
do
until ( eof1 )
set small end
eof1
do hmod (key, hsize) by
1
if h hsize then h 0
if
hkey(h)key and nodupes then leave
if hkey(h)
. then do
hkey(h) key
hsat(h) s_sat
leave
end
end
end
...
9
21GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00).
(00). (01). (01). (02)971
(02)11 (03). (03). (04).
(04). (05). (05). (06).
(06). (07). (07). (08).
(08). (09)400 (09)22 (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16).
(16). (17). (17).
0 if dupes to be pulled let nodupes 1
data match
(keepkey s_sat l_sat)
retain nodupes
nodupes..
array hkey(0hsize)
_temporary_
array hsat(0hsize) _temporary_
do
until ( eof1 )
set small end
eof1
do hmod (key, hsize) by
1
if h hsize then h 0
if
hkey(h)key and nodupes then leave
if hkey(h)
. then do
hkey(h) key
hsat(h) s_sat
leave
end
end
end
...
9
22GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00).
(00). (01). (01). (02)971
(02)11 (03). (03). (04).
(04). (05)260 (05)33 (06).
(06). (07). (07). (08).
(08). (09)400 (09)22 (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16).
(16). (17). (17).
0 if dupes to be pulled let nodupes 1
data match
(keepkey s_sat l_sat)
retain nodupes
nodupes..
array hkey(0hsize)
_temporary_
array hsat(0hsize) _temporary_
do
until ( eof1 )
set small end
eof1
do hmod (key, hsize) by
1
if h hsize then h 0
if
hkey(h)key and nodupes then leave
if hkey(h)
. then do
hkey(h) key
hsat(h) s_sat
leave
end
end
end
...
9
23GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00).
(00). (01). (01). (02)971
(02)11 (03). (03). (04)922
(04)44 (05)260 (05)33 (06).
(06). (07). (07). (08).
(08). (09)400 (09)22 (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16).
(16). (17). (17).
0 if dupes to be pulled let nodupes 1
data match
(keepkey s_sat l_sat)
retain nodupes
nodupes..
array hkey(0hsize)
_temporary_
array hsat(0hsize) _temporary_
do
until ( eof1 )
set small end
eof1
do hmod (key, hsize) by
1
if h hsize then h 0
if
hkey(h)key and nodupes then leave
if hkey(h)
. then do
hkey(h) key
hsat(h) s_sat
leave
end
end
end
...
9
24GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00).
(00). (01)970 (01)55 (02)971
(02)11 (03). (03). (04)922
(04)44 (05)260 (05)33 (06).
(06). (07). (07). (08).
(08). (09)400 (09)22 (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16).
(16). (17). (17).
0 if dupes to be pulled let nodupes 1
data match
(keepkey s_sat l_sat)
retain nodupes
nodupes..
array hkey(0hsize)
_temporary_
array hsat(0hsize) _temporary_
do
until ( eof1 )
set small end
eof1
do hmod (key, hsize) by
1
if h hsize then h 0
if
hkey(h)key and nodupes then leave
if hkey(h)
. then do
hkey(h) key
hsat(h) s_sat
leave
end
end
end
...
9
25GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00).
(00). (01)970 (01)55 (02)971
(02)11 (03). (03). (04)922
(04)44 (05)260 (05)33 (06).
(06). (07). (07). (08).
(08). (09)400 (09)22 (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16)543
(16)66 (17). (17).
0 if dupes to be pulled let nodupes 1
data match
(keepkey s_sat l_sat)
retain nodupes
nodupes..
array hkey(0hsize)
_temporary_
array hsat(0hsize) _temporary_
do
until ( eof1 )
set small end
eof1
do hmod (key, hsize) by
1
if h hsize then h 0
if
hkey(h)key and nodupes then leave
if hkey(h)
. then do
hkey(h) key
hsat(h) s_sat
leave
end
end
end
...
8
9
26GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00).
(00). (01)970 (01)55 (02)971
(02)11 (03). (03). (04)922
(04)44 (05)260 (05)33 (06).
(06). (07). (07). (08).
(08). (09)400 (09)22 (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16)543
(16)66 (17). (17).
0 if dupes to be pulled let nodupes 1
data match
(keepkey s_sat l_sat)
retain nodupes
nodupes..
array hkey(0hsize)
_temporary_
array hsat(0hsize) _temporary_
do
until ( eof1 )
set small end
eof1
do hmod (key, hsize) by
1
if h hsize then h 0
if
hkey(h)key and nodupes then leave
if hkey(h)
. then do
hkey(h) key
hsat(h) s_sat
leave
end
end
end
...
6
5
7
27GENERATION I
let load 0.625
data _null_
do pceil(p/load) by 1
until(jup1) up ceil(sqrt(p)) do j2 to
up until(not mod(p,j)) end end call
symput('hsize', left(put(p,best.)))
stop set small nobsp run 0 if dupes to be
pulled let nodupes 1
data match (keepkey s_sat
l_sat)
retain nodupes nodupes..
array
hkey (0hsize) _temporary_
array hsat (0hsize)
_temporary_
...
HKEY HSAT (00)050
(00)88 (01)970 (01)55 (02)971
(02)11 (03). (03). (04)922
(04)44 (05)260 (05)33 (06).
(06). (07). (07). (08).
(08). (09)400 (09)22 (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16)543
(16)66 (17). (17).
0 if dupes to be pulled let nodupes 1
data match
(keepkey s_sat l_sat)
retain nodupes
nodupes..
array hkey(0hsize)
_temporary_
array hsat(0hsize) _temporary_
do
until ( eof1 )
set small end
eof1
do hmod (key, hsize) by
1
if h hsize then h 0
if
hkey(h)key and nodupes then leave
if hkey(h)
. then do
hkey(h) key
hsat(h) s_sat
leave
end
end
end
...
3
2
1
4
28GENERATION I
%let load = 0.625 ;
data _null_ ;
   do p = ceil (p / &load) by 1 until (j = up + 1) ;
      up = ceil (sqrt (p)) ;
      do j = 2 to up until (not mod (p, j)) ;
      end ;
   end ;
   call symput ('hsize', left (put (p, best.))) ;
   stop ;
   set small nobs = p ;
run ;
* 0 if dupes to be pulled ;
%let nodupes = 1 ;
data match (keep = key s_sat l_sat) ;
   retain nodupes &nodupes ;
   array hkey (0 : &hsize) _temporary_ ;
   array hsat (0 : &hsize) _temporary_ ;
   ...
HKEY HSAT (00)050
(00)88 (01)970 (01)55 (02)971
(02)11 (03)067 (03)99 (04)922
(04)44 (05)260 (05)33 (06).
(06). (07). (07). (08).
(08). (09)400 (09)22 (10).
(10). (11). (11). (12).
(12). (13). (13). (14).
(14). (15). (15). (16)543
(16)66 (17). (17).
* 0 if dupes to be pulled ;
%let nodupes = 1 ;
data match (keep = key s_sat l_sat) ;
   retain nodupes &nodupes ;
   array hkey (0 : &hsize) _temporary_ ;
   array hsat (0 : &hsize) _temporary_ ;
   do until ( eof1 ) ;
      set small end = eof1 ;
      do h = mod (key, &hsize) by 1 ;
         if h = &hsize then h = 0 ;
         if hkey(h) = key and nodupes then leave ;
         if hkey(h) = . then do ;
            hkey(h) = key ;
            hsat(h) = s_sat ;
            leave ;
         end ;
      end ;
   end ;
   ...
   ...
   do until ( eof2 ) ;
      set large end = eof2 ;
      do h = mod (key, &hsize) by 1 until ( hkey(h) = . ) ;
         if h = &hsize then h = 0 ;
         if hkey(h) = key then do ;
            s_sat = hsat(h) ;
            output ;
            if nodupes then leave ;
         end ;
      end ;
   end ;
   stop ;
run ;
295
4
3
2
1
30GENERATION II
3
2
1
31GENERATION II
data match ( drop = rc ) ;
   length key $ 9 s_sat 8 ;
   declare AssociativeArray hh () ;
   rc = hh.DefineKey ( 'key' ) ;
   rc = hh.DefineData ( 's_sat' ) ;
   rc = hh.DefineDone () ;
   do until ( eof1 ) ;
      set small end = eof1 ;
      rc = hh.add () ;
   end ;
   do until ( eof2 ) ;
      set large end = eof2 ;
      rc = hh.find () ;
      if rc = 0 then output ;
   end ;
   stop ;
run ;
data match ;
   length key $ 9 s_sat 8 ;
   declare AssociativeArray hh () ;
   hh.DefineKey ( 'key' ) ;
   hh.DefineData ( 's_sat' ) ;
   hh.DefineDone () ;
   do until ( eof1 ) ;
      set small end = eof1 ;
      hh.add () ;
   end ;
   do until ( eof2 ) ;
      set large end = eof2 ;
      if hh.find () = 0 then output ;
   end ;
   stop ;
run ;
data match
length key 9 s_sat 8
declare AssociativeArray hh ()
hh.DefineKey ( 'key' )
hh.DefineData (
's_sat' )
hh.DefineDone ()
do until ( eof1 )
set small end eof1
hh.add ()
end
do until ( eof2 )
set large end eof2
if hh.find
()0 then output
end
stop