/*
   Stratified random sampling with the Census of Jails data.

   Today, you will need:
     1. This code
     2. The transport file from the Census of Jails: 26602-0001-Data.stc
     3. The file "JailsRecode.sas" from the course website

   Outline:
     Part 1  import the transport file into the permanent library and recode
     Part 2  create the stratification variables size and region
     Part 3  build a population with no missing values in V18, V24, V48
     Part 4  draw stratified samples (equal, proportional, Neyman, optimal)
     Part 5  estimate a mean, a proportion and a total from the Neyman sample
*/

LIBNAME mysaslib 'C:\Users\ekaizar\Documents\Teaching\6510\Saslib';
options fmtsearch = (mysaslib); *this line tells SAS where to look for formats;

/*NOTE: Last time we did not deal with the "formats" in the SAS transport file.
  They were probably automatically loaded into your work library, which means
  that you no longer have access to them! To get them back, you will need to
  re-do the import of the dataset. From now on, you will also need to tell SAS
  where to search for the formats.*/

/*--------------------------------------------------------------------*/
/* Part 1: Get the data, put it into your permanent library           */
/*--------------------------------------------------------------------*/

*read the sas dataset from the "transport" file into my permanent library;
*b);
proc cimport
    infile = 'C:\Users\ekaizar\Documents\Teaching\6510\ICPSR_26602\ICPSR_26602\DS0001\26602-0001-Data.stc'
    library = mysaslib;
run;

*c);
*rename the dataset to my permanent library;
*also recode the missing values from '-1' to '.' using the commands in JailsRecode.sas;
data mysaslib.jails;
    set mysaslib.Da26602p1;
    %INCLUDE 'C:\Users\ekaizar\Documents\Teaching\6510\JailsRecode.sas'; *include tells sas to submit all the lines in this file;
run;

/*--------------------------------------------------------------------*/
/* Part 2: Create Strata                                              */
/*--------------------------------------------------------------------*/

data mysaslib.jails;
    /* Declare the lengths BEFORE the first assignment (and before SET).
       Without this, SAS fixes the length of a character variable at its
       first assignment: size would be $5 and "medium" would be stored as
       "mediu", and region would be $9 and "misclassified" would be stored
       as "misclassi". */
    length size $6 region $13;
    set mysaslib.jails;

    *stratify by number of facilities;
    /* NOTE(review): the final ELSE also catches a missing V20, so such
       records are labelled "large" - confirm this is intended. */
    if V20=1 then size="small";
    else if V20=2 then size="medium";
    else size="large";

    *stratify by region, using census regions;
    *northeast;
    if V7 in ('ME' 'NH' 'VT' 'MA' 'RI' 'CT' 'NY' 'PA' 'NJ' 'PR')
        then region="northeast";
    *midwest;
    else if V7 in ('WI' 'MI' 'IL' 'IN' 'OH' 'MO' 'ND' 'SD' 'NE' 'KS' 'MN' 'IA')
        then region="midwest";
    *south;
    else if V7 in ('DE' 'MD' 'DC' 'VA' 'WV' 'NC' 'SC' 'GA' 'FL' 'KY' 'TN' 'MS' 'AL' 'OK' 'TX' 'AR' 'LA')
        then region="south";
    *west;
    else if V7 in ('ID' 'MT' 'WY' 'NV' 'UT' 'CO' 'AZ' 'NM' 'AK' 'WA' 'OR' 'CA' 'HI')
        then region="west";
    else region="misclassified";
run;

*check the stratification;
proc freq data=mysaslib.jails;
    tables size region;
run;

/*--------------------------------------------------------------------*/
/* Part 3: Create a population with no missing values                 */
/*         in the variables of interest                               */
/*--------------------------------------------------------------------*/

data mysaslib.jailpop;
    set mysaslib.jails;
    /* subsetting IFs: keep only records where all three analysis
       variables are non-missing */
    if V18 ~= .;
    if V24 ~= .;
    if V48 ~= .;
run;

/*--------------------------------------------------------------------*/
/* Part 4: Take a Stratified Random Sample using size                 */
/*--------------------------------------------------------------------*/

*first we must SORT by size;
proc sort data=mysaslib.jailpop;
    by size;
run;

/*a)---------------*/
*stratify by size, select 15 from each stratum;
proc surveyselect data=mysaslib.jailpop method=srs n=15 /*seed=1953*/
        out=mysaslib.jail_size_15;
    strata size;
run;

*check out how it worked;
proc freq data=mysaslib.jail_size_15;
    table size;
run;

/*b)---------------*/
*stratify by size, select a total of 45, allocated proportionately to each stratum;
proc surveyselect data=mysaslib.jailpop method=srs n=45 /*seed=1953*/
        out=mysaslib.jail_size_prop;
    strata size /alloc=prop;
run;

*check out how it worked;
proc freq data=mysaslib.jail_size_prop;
    table size;
run;
proc freq data=mysaslib.jailpop;
    table size;
run;

/* note that we have a problem because we do not want any stratum with
   only one unit sampled! */
*manually fix this;
*output the sample sizes to a dataset;
proc surveyselect data=mysaslib.jailpop method=srs n=45 /*seed=1953*/
        out=mysaslib.sizeprop;
    strata size /alloc=prop nosample;
run;

*move one sampled value from the small stratum to the large stratum;
data mysaslib.sizeprop;
    set mysaslib.sizeprop;
    if size="large" then SampleSize=SampleSize+1;
    if size="small" then SampleSize=SampleSize-1;
    keep size SampleSize;
run;

*re-sample with these sizes!;
proc surveyselect data=mysaslib.jailpop method=srs n=mysaslib.sizeprop /*seed=1953*/
        out=mysaslib.jail_size_prop;
    strata size;
run;

*check out how it worked;
proc freq data=mysaslib.jail_size_prop;
    table size;
run;

/*c)---------------*/
/* stratify by size, select a total of 45, allocated proportionately to
   each stratum, but adjusted for predicted variability (Neyman) */
/* suppose the variance for the #inmates variable is:
     small:  [(range ~ 0 to 800)/4]^2 =    40,000
     medium: [(50 to 2050)/4]^2       =   250,000
     large:  [(500 to 10500)/4]^2     = 6,250,000 */
proc surveyselect data=mysaslib.jailpop method=srs n=45 /*seed=1953*/
        out=mysaslib.jail_size_neyman;
    strata size /alloc=neyman var=(6250000, 250000, 40000);
    *note the variance is specified in alphabetical order (large, medium, small);
run;

*check out how it worked;
proc freq data=mysaslib.jail_size_neyman;
    table size;
run;
*note that Neyman allocates more to large b/c more variable!;
*no need to adjust;

/*d)---------------*/
/* stratify by size, select a total of 45, allocated proportionately to
   each stratum, but adjusted for predicted variability and cost (optimal) */
/* suppose the variance for the #inmates variable is:
     small:  [(range ~ 0 to 800)/4]^2 =    40,000
     medium: [(50 to 2050)/4]^2       =   250,000
     large:  [(500 to 10500)/4]^2     = 6,250,000 */
/* suppose the cost for sampling is:
     small:  1
     medium: 2
     large:  3 */
proc surveyselect data=mysaslib.jailpop method=srs n=45 /*seed=1953*/
        out=mysaslib.jail_size_opt;
    strata size /alloc=optimal var=(6250000, 250000, 40000) cost=(3, 2, 1);
    *note the variance and cost are specified in alphabetical order (large, medium, small);
run;

*check out how it worked;
proc freq data=mysaslib.jail_size_opt;
    table size;
run;
*note that adding cost reduces the number of large b/c very expensive;

/*e)---------------*/
/*note: you can also specify the variance and cost in a dataset*/
*the variable name for the variance MUST BE '_VAR_';
*the variable name for the cost MUST BE '_COST_';
data mysaslib.sizevarcost;
    /* Give size the same length as in jailpop so the stratum values
       ("medium" in particular) and the variable length agree between the
       two data sets when SURVEYSELECT matches strata. */
    length size $6;
    input size $ _VAR_ _COST_;
datalines;
small 40000 1
medium 250000 2
large 6250000 3
;
run;

/* the secondary data set is sorted by the stratum variable, like jailpop */
proc sort data=mysaslib.sizevarcost;
    by size;
run;

proc surveyselect data=mysaslib.jailpop method=srs n=45 /*seed=1953*/
        out=mysaslib.jail_size_opt;
    strata size /alloc=optimal var=mysaslib.sizevarcost cost=mysaslib.sizevarcost;
run;

*check out how it worked;
proc freq data=mysaslib.jail_size_opt;
    table size;
run;

/*--------------------------------------------------------------------*/
/* Part 5: Estimate Using Stratified Random Sample                    */
/*--------------------------------------------------------------------*/

*First, we must make a dataset that will tell SAS how many units are in each stratum (N_h);
*we can use surveyselect to do this;
*I copied this proc statement from above, changed the out= name, and added the "nosample";
proc surveyselect data=mysaslib.jailpop method=srs n=45 /*seed=1953*/
        out=mysaslib.sizeneyman;
    strata size /alloc=neyman var=(6250000, 250000, 40000) nosample;
run;

data mysaslib.sizeneyman;
    set mysaslib.sizeneyman;
    _TOTAL_ = TOTAL; *add the underscores so surveymeans can find it;
    keep size _TOTAL_; *get rid of the extra variables;
run;

*a. ybarU;
*TRUE VALUE;
proc means data=mysaslib.jailpop;
    var V18;
run;

*Estimate with Neyman Allocation;
*SAS uses the weight-based formulas, so we must include the weight for each unit in the sample!;
proc surveymeans data=mysaslib.jail_size_neyman N=mysaslib.sizeneyman; /*if you put a number here, SAS assumes all strata are same size*/
    stratum size;
    weight SamplingWeight; /*weight = N_h/n_h, calculated by surveyselect*/
    var V18;
run;

*b. p;
*TRUE VALUE;
proc freq data=mysaslib.jailpop;
    table V24;
run;

*Estimate with Neyman Allocation;
proc surveyfreq data=mysaslib.jail_size_neyman N=mysaslib.sizeneyman;
    stratum size;
    weight SamplingWeight; /*weight = N_h/n_h, calculated by surveyselect*/
    table V24/CL;
run;

*c. t;
*TRUE VALUE;
proc univariate data=mysaslib.jailpop;
    var V48;
    ods select moments;
run;

*Estimate with Neyman Allocation;
proc surveymeans data=mysaslib.jail_size_neyman N=mysaslib.sizeneyman SUM CLSUM;
    stratum size;
    weight SamplingWeight; /*weight = N_h/n_h, calculated by surveyselect*/
    var V48;
run;