/*** This SAS code gives some examples of how to use SAS to manipulate data
     and create subsets of data. It also gives some examples of graphing. ***/

/* Import the data, an Excel spreadsheet with variable names in the first row. */
proc import out=salary_data
    datafile="Y://Users/rdecook/Desktop/salary_SAS_intro.xls"
    replace;
run;

proc print data=salary_data (obs=10);
run;

/* Create a new variable, the log transformation of salary, and add it to the
   existing data set 'salary_data'. */
data salary_data;
    set salary_data;
    logSAL = log(Salary_9_mo);
run;

proc print data=salary_data (obs=10);
run;

proc univariate data=salary_data plot;
    var logSAL;
run;

/* Frequency table of the number of faculty in each department. */
proc freq data=salary_data;
    table department;
run;

/* Because there are a lot of departments, group them into 5 departmental
   categories. The quotation marks are needed because Department is a
   character (nominal) variable.

   The LENGTH statement is required: without it SAS sizes deptcat from the
   first assignment ("AgSci", 5 characters) and silently truncates "BioSci",
   "PhysSci" and "MathSci". Departments that are not listed below keep a
   blank deptcat. */
data salary_data;
    set salary_data;
    length deptcat $ 7;

    if Department in ("AGRON", "AN S", "NREM", "FSHNA", "FSHNF", "PL P", "HORT", "ENT")
        then deptcat = "AgSci";
    else if Department in ("BBMBA", "BBMBS", "GDCBA", "GDCBS", "EEOBA", "EEOBS")
        then deptcat = "BioSci";
    else if Department in ("PHYSA", "CHEM", "GE AT")
        then deptcat = "PhysSci";
    else if Department in ("STAT", "MATH", "COM S")
        then deptcat = "MathSci";
    else if Department in ("CH E", "E CPE", "IMSE", "M S E", "A B E", "CCE E", "M E", "AER E")
        then deptcat = "Eng";
run;

proc freq data=salary_data;
    table deptcat;
run;

proc print data=salary_data (obs=30);
run;

/* Create separate data sets for the different ranks of professors.
   R1 are full, R2 are associate, and R3 are assistant professors.
   A single pass over salary_data routes each row to its rank's data set;
   rows with any other rank_code are written to none of them. */
data R1 R2 R3;
    set salary_data;
    if rank_code = 1 then output R1;
    else if rank_code = 2 then output R2;
    else if rank_code = 3 then output R3;
run;

/* Look at the relationship between salary and 'Average contracts and grants'
   for each rank, using a different symbol for men and women (gender). */
proc gplot data=R1;
    plot Salary_9_mo*Avg_Cont_Grants=gender;
run;

/* A couple of outliers make that plot not very useful, so drop them.
   NOTE(review): rows with a missing Avg_Cont_Grants also pass this filter,
   because SAS treats missing as smaller than any number - confirm that is
   intended. */
data R1_noouts;
    set R1;
    if Avg_Cont_Grants <= 3000000;
run;

proc gplot data=R1_noouts;
    plot Salary_9_mo*Avg_Cont_Grants=gender;
run;

/************ Choose and define the symbols used **************/
symbol1 v=square i=none color=black;
symbol2 v=star   i=none color=red;
symbol3 v=circle i=none color=green;
symbol4 v=plus   i=none color=black;

title 'Full Professors';
proc gplot data=R1_noouts;
    plot Salary_9_mo*Avg_Cont_Grants=gender;
run;

title 'Associate Professors';
proc gplot data=R2;
    plot Salary_9_mo*Avg_Cont_Grants=gender;
run;

title 'Assistant Professors';
proc gplot data=R3;
    plot Salary_9_mo*Avg_Cont_Grants=gender;
run;

/* TITLE is a global statement: clear it so the model output and diagnostic
   plots below are not labelled 'Assistant Professors'. */
title;

/************************************************************
 *** General Linear Model:                                ***
 *** Fit a model to predict salary from other variables   ***
 ************************************************************/

/* Put R1_noouts, R2 and R3 together as a single data set for analysis. */
data finalset;
    set R1_noouts R2 R3;
run;

/* Run the analysis using the full model. */
proc glm data=finalset;
    class gender deptcat Rank_Code;
    model Salary_9_mo = gender deptcat Rank_Code Avg_Cont_Grants / solution;
run;

/* Run the analysis using the full model and keep the predicted values and
   residuals for checking diagnostics. */
proc glm data=finalset;
    class gender deptcat Rank_Code;
    model Salary_9_mo = gender deptcat Rank_Code Avg_Cont_Grants / solution;
    output out=diagout p=preds r=resids;
run;

/* Diagnostics: checking for constant variance.
   The '=4' here means I want the 4th symbol used. */
proc gplot data=diagout;
    plot resids*preds=4;
run;

proc gplot data=diagout;
    plot resids*preds=deptcat;
run;
/* Looks like there's some violation of the assumption. */

/* Diagnostics: normality. */
proc univariate data=diagout normal plots;
    var resids;
run;

/* Which observations are the outliers?
   The missing() guard matters for the lower tail: SAS orders missing values
   below every number, so a bare 'resids <= -55000' would also list every
   observation that has no residual. */
data outliers1;
    set diagout;
    if not missing(resids) and resids <= -55000;
    keep Department Rank_Code Avg_Cont_Grants resids;
run;

proc print data=outliers1;
run;

data outliers2;
    set diagout;
    if resids >= 55000;
    keep Department Rank_Code Avg_Cont_Grants resids;
run;

proc print data=outliers2;
run;

/* Exploring how salaries relate to gender, deptcat and rank. */
proc sort data=finalset;
    by gender rank_code deptcat;
run;

proc means data=finalset;
    var Salary_9_mo;
    by gender;
run;

/* Mean salary for each gender x rank x department-category cell, saved as
   variable m in salarymeans2. */
proc means data=finalset noprint;
    var Salary_9_mo;
    by gender rank_code deptcat;
    output out=salarymeans2 mean=m;
run;

proc sort data=salarymeans2;
    by deptcat rank_code;
run;

proc print data=salarymeans2;
run;

proc freq data=finalset;
    table rank_code*gender;
run;