Macros for SAS Application Developers
https://github.com/sasjs/core
Loading...
Searching...
No Matches
mp_hashdirectory.sas
Go to the documentation of this file.
1/**
2 @file
3 @brief Returns a unique hash for each file in a directory
4 @details Hashes each file in each directory, and then hashes the hashes to
5 create a hash for each directory also.
6
7 This makes use of the new `hashing_file()` and `hashing` functions, available
8 since 9.4m6. Interestingly, those functions can be used in pure macro, eg:
9
10 %put %sysfunc(hashing_file(md5,/path/to/file.blob,0));
11
12 Actual usage:
13
14 %let fpath=/some/directory;
15
16 %mp_hashdirectory(&fpath,outds=myhash,maxdepth=2)
17
18 data _null_;
19 set work.myhash;
20 put (_all_)(=);
21 run;
22
23 Whilst files are hashed in their entirety, the logic for creating a folder
24 hash is as follows:
25
26 @li Sort the files by filename (case sensitive, uppercase then lower)
27 @li Take the first 100 hashes, concatenate and hash
28 @li Concatenate this hash with another 100 hashes and hash again
29 @li Continue until the end of the folder. This is the folder hash
30 @li If a folder contains other folders, start from the bottom of the tree -
31 the folder hashes cascade upwards so you know immediately if there is a
32 change in a sub/sub directory
33 @li If a subfolder has no content (empty) then it is ignored. No hash created.
34 @li If the file is empty, it is also ignored / no hash created.
35 @li If the target directory (&inloc) is empty, &outds will also be empty
36
37 <h4> SAS Macros </h4>
38 @li mp_dirlist.sas
39
40 <h4> Related Files </h4>
41 @li mp_hashdataset.sas
42 @li mp_hashdirectory.test.sas
43 @li mp_md5.sas
44
45 @param [in] inloc Full path of the directory to be hashed (unquoted)
46 @param [in] iftrue= (1=1) A condition under which the macro should be executed
47 @param [in] maxdepth= (0) Set to a positive integer to indicate the level of
48 subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited
49 recursion, set to MAX.
50 @param [in] method= (MD5) the hashing method to use. Available options:
51 @li MD5
52 @li SHA1
53 @li SHA256
54 @li SHA384
55 @li SHA512
56 @li CRC32
57 @param [out] outds= (work.mp_hashdirectory) The output dataset. Contains:
58 @li directory - the parent folder
59 @li file_hash - the hash output
60 @li hash_duration - how long the hash took (first hash always takes longer)
61 @li file_path - /full/path/to/each/file.ext
62 @li file_or_folder - contains either "file" or "folder"
63 @li level - the depth of the directory (top level is 0)
64
65 @version 9.4m6
66 @author Allan Bowe
67**/
68
%macro mp_hashdirectory(inloc,
  outds=work.mp_hashdirectory,
  method=MD5,
  maxdepth=0,
  iftrue=%str(1=1)
)/*/STORE SOURCE*/;

  /*
    Hashes every non-empty file under &inloc, then rolls the file hashes up
    into folder hashes, working from the deepest level back to level 0.

    inloc     - directory to scan (handed to mp_dirlist as path=)
    outds     - output dataset: directory, file_hash, hash_duration,
                file_path, file_or_folder, level
    method    - algorithm name passed to hashing_file() / hashing()
    maxdepth  - recursion depth handed to mp_dirlist
    iftrue    - macro condition; the macro does nothing when it is false
  */

  %local curlevel tempds maxlevel hashlen;

  %if not(%eval(%unquote(&iftrue))) %then %return;

  /*
    Width of the hex digest column. A fixed $32 truncates SHA1 (40 chars),
    SHA256 (64), SHA384 (96) and SHA512 (128). 32 is kept as the floor so
    that MD5 and CRC32 output keeps the column width it always had.
  */
  %if %upcase(&method)=SHA512 %then %let hashlen=128;
  %else %if %upcase(&method)=SHA384 %then %let hashlen=96;
  %else %if %upcase(&method)=SHA256 %then %let hashlen=64;
  %else %if %upcase(&method)=SHA1 %then %let hashlen=40;
  %else %let hashlen=32;

  /* get the directory listing */
  %mp_dirlist(path=&inloc, outds=&outds, maxdepth=&maxdepth, showparent=YES)

  /* create the file hashes (folder rows keep a blank hash for now) */
  data &outds;
    set &outds (rename=(filepath=file_path));
    length FILE_HASH $&hashlen HASH_DURATION 8;
    /* KEEP alone decides the output columns - scratch vars are excluded */
    keep directory file_hash hash_duration file_path file_or_folder level;

    ts=datetime();
    if file_or_folder='file' then do;
      /* if file is empty, hashing_file will break - so ignore / delete */
      length fname val $8;
      /* fname is blank, so filename() assigns a system-generated fileref */
      rc=filename(fname,file_path);
      fid=fopen(fname);
      if fid > 0 then do;
        /* try to read one record and one token - non-zero means no data */
        rc=fread(fid);
        is_empty=fget(fid,val);
        rc=fclose(fid);
      end;
      rc=filename(fname);
      /* is_empty stays missing when the file could not be opened, so */
      /* unreadable files are removed along with the empty ones */
      if is_empty ne 0 then delete;
      else file_hash=hashing_file("&method",cats(file_path),0);
    end;
    hash_duration=datetime()-ts;
  run;

  /* deepest level first, so the first observation carries the max level */
  proc sort data=&outds ;
    by descending level directory file_path;
  run;

  %let maxlevel=0;
  data _null_;
    set &outds;
    call symputx('maxlevel',level,'l');
    stop;
  run;

  /* now hash the hashes to populate folder hashes, starting from the bottom */
  %do curlevel=&maxlevel %to 0 %by -1;
    data work._data_ (keep=directory file_hash);
      set &outds;
      where level=&curlevel;
      by descending level directory file_path;
      length str $32767 tmp_hash $&hashlen;
      retain str tmp_hash ;
      /* reset vars when starting a new directory */
      if first.directory then do;
        str='';
        tmp_hash='';
        i=0;
      end;
      /* hash each chunk of 100 file hashes, chained onto the running hash */
      i+1;
      str=cats(str,file_hash);
      if mod(i,100)=0 or last.directory then do;
        tmp_hash=hashing("&method",cats(tmp_hash,str));
        str='';
      end;
      /* output the hash at directory level */
      if last.directory then do;
        file_hash=tmp_hash;
        output;
      end;
      if last.level then stop;
    run;
    /* work._data_ resolves to a generated name - capture it */
    %let tempds=&syslast;
    /* join the hash back into the main table: a folder row picks up the */
    /* hash computed for the directory whose path equals its file_path */
    proc sql undo_policy=none;
    create table &outds as
      select a.directory
        ,coalesce(b.file_hash,a.file_hash) as file_hash
        ,a.hash_duration
        ,a.file_path
        ,a.file_or_folder
        ,a.level
      from &outds a
      left join &tempds b
      on a.file_path=b.directory
      order by level desc, directory, file_path;
    drop table &tempds;
    /* end the procedure so it is not left running after the macro */
    quit;
  %end;

%mend mp_hashdirectory;