########################################
## Making plots using ggplot
## 6th TRUG presentation, February 2014
## Karel Kroeze
########################################

# Load some data and do some basic cleanup ----
# NOTE(review): hard-coded, machine-specific working directory; point it at the
# folder that holds data.csv (titanic3.csv is read relative to it further down).
setwd("C:/users/karel/desktop")
data <- read.csv("data.csv", sep = ";")
names(data) <- c(
  "leerling.id", "sex", "dob", "weight", "leerjaar", "ToetsInstr",
  "ToetsME", "datum", "score", "tijdvak", "BRIN", "tijdstip"
)
summary(data)
data$leerling.id <- factor(data$leerling.id)
data$sex <- factor(data$sex)
data$leerjaar <- factor(data$leerjaar)
data$tijdstip <- factor(data$tijdstip)
# BRIN is relabelled through levels() further down, so it must be a factor;
# read.csv() does not convert strings to factors by default in R >= 4.0.
data$BRIN <- factor(data$BRIN)

# Do some descriptives the old-fashioned way ----
# Columns are referenced explicitly (data$..., with(), data =) rather than via
# attach(), so nothing depends on a stale copy of `data` on the search path.
hist(data$score)
boxplot(score ~ sex, data = data)
boxplot(score ~ BRIN + sex, data = data, col = c("blue", "red"))
boxplot(score ~ leerjaar, data = data)
plot(score ~ as.numeric(leerjaar), data = data)
abline(lm(score ~ as.numeric(leerjaar), data = data), col = "red")
lines(with(data, lowess(as.numeric(leerjaar), score)), col = "blue")

# Two histograms side by side; restore the previous layout afterwards.
old_par <- par(mfrow = c(1, 2))
hist(subset(data$score, data$sex == 0))
hist(subset(data$score, data$sex == 1))
par(old_par)

# Using lattice ----
library(lattice)
splom(data[, c("sex", "leerjaar", "score")])
# We can change a lot, but that requires creating custom panel functions

# Using ggplot ----
library(ggplot2)

# Pie example
# Create some data (sex is recycled over the 500 sampled categories)
pie.data <- data.frame(
  sex = factor(0:1, labels = c("Male", "Female")),
  cat = as.factor(sample(1:4, 500, replace = TRUE))
)
# Set up the basic data in a ggplot object
pie <- ggplot(data = pie.data, aes(x = factor(1), fill = cat))
# Add a stacked bar chart; geom_bar() counts rows per category by default
pie + geom_bar(position = "stack", width = 1)
# Add a polar coordinate set
pie + geom_bar(position = "stack", width = 1) + coord_polar(theta = "y")
# Add a facet wrap
pie +
  geom_bar(position = "stack", width = 1) +
  coord_polar(theta = "y") +
  facet_wrap(~sex)

# Data example
# Set up the data in a ggplot object
plot <- ggplot(data, aes(x = leerjaar, y = score))

# Try out a lot of the basic graphics
plot + geom_point() # scatter
plot + geom_boxplot() # box
plot + geom_point() + facet_grid(BRIN ~ sex) # faceted scatter
plot + geom_boxplot() + aes(colour = sex)
plot + geom_boxplot() + aes(colour = sex) + facet_grid(BRIN ~ .)
plot + geom_point() + aes(colour = sex) + facet_grid(. ~ BRIN)
plot + geom_point() + aes(colour = BRIN) + facet_grid(. ~ sex)
# Grouping/colour set on the smoother only ...
plot +
  geom_point(alpha = .5) +
  facet_grid(. ~ BRIN) +
  geom_smooth(aes(group = sex, colour = sex), method = "lm", se = TRUE)
# ... versus grouping/colour set for the whole plot.
plot +
  geom_point(alpha = .5) +
  facet_grid(. ~ BRIN) +
  geom_smooth(method = "lm", se = TRUE) +
  aes(group = sex, colour = sex)

plot <- plot +
  geom_point(alpha = .5) +
  facet_grid(. ~ BRIN) +
  geom_smooth(method = "lm", se = TRUE) +
  aes(group = sex, colour = sex)
plot
plot +
  scale_x_discrete("Leerjaar") +
  scale_y_continuous("Vaardigheidsscore") +
  scale_colour_discrete("Sexe")

# Can't change labels on facets easily, easier to change in data. (There are
# complicated ways of 'hacking' into the object and changing the labels.)
levels(data$sex) <- c("Boy", "Girl")
levels(data$BRIN) <- c(
  "School A", "School B", "School C", "School D", "School E"
)
plot <- ggplot(data, aes(x = leerjaar, y = score, group = sex, colour = sex)) +
  geom_point(alpha = .5) +
  facet_grid(. ~ BRIN) +
  geom_smooth(method = "lm", se = TRUE) +
  scale_x_discrete("Leerjaar") +
  scale_y_continuous("Vaardigheidsscore") +
  scale_colour_discrete("Sekse")
plot

# Annotations do not work nicely with facets, because geom_text is really just
# a geom like point, line, etc.: the same label ends up in every facet.
plot +
  geom_text(
    data = NULL, label = "Annotation", x = 4, y = 160, show.legend = FALSE
  )

# Example of how geom_text is meant to be used.
ex <- data.frame(
  x = runif(10),
  y = runif(10),
  label = c(
    "One", "Two", "Three", "Four", "Five",
    "Six", "Seven", "Eight", "Nine", "Ten"
  )
)
ex.plot <- ggplot(ex, aes(x = x, y = y)) + geom_point()
ex.plot
ex.plot + geom_text(aes(label = label), hjust = 0, vjust = 0)

# To have different annotations per facet we need a data.frame with labels for
# each facet variable AND group, colour, etc.
plot +
  geom_text(
    aes(label = lab),
    data = data.frame(
      BRIN = levels(data$BRIN),
      lab = c("**", "", "", "***", "*"),
      leerjaar = 4,
      score = 160,
      sex = "Boy"
    ),
    show.legend = FALSE
  )

library(grid)
# Let's try some arrows... same problem: the arrow shows up in every facet.
plot +
  geom_segment(
    aes(x = 2, y = 150, xend = 3, yend = 125),
    arrow = arrow(length = unit(0.5, "cm"))
  )

# Same solution, with some playing around. We don't need an arrow in all
# facets, so no arrow in schools A and B.
plot +
  geom_segment(
    aes(x = x, y = y, xend = xend, yend = yend),
    arrow = arrow(length = unit(.5, "cm")),
    # Set the colour; can also be a vector of colours, or a variable (similar
    # to an aesthetic).
    colour = "black",
    # Make sure to give an equal number of values on all variables.
    data = data.frame(
      BRIN = levels(data$BRIN)[c(3:5, 5)], # Schools C-E, E twice.
      sex = c("Boy", "Girl", "Boy", "Girl"), # Different group per arrow.
      x = c(1, 2, 2, 3),
      y = c(150, 100, 125, 125),
      xend = c(2, 3, 3, 4),
      yend = c(100, 100, 100, 100)
    )
  )

# The good news of this is that you can easily mark points of interest...
# First we add a column to the data.frame.
data$annotations <- ""
# Let's say one person is of notable interest, let's call him subject 51.
unique(data$leerling.id)[51]
# NOTE(review): the original comment named subject 101494 as the 51st id, but
# the code annotates id '101268' -- confirm which one is intended.
data$annotations[which(data$leerling.id == "101268")] <-
  "Pay attention to this guy"
# Now we have an annotations variable with notes for data points of interest.

# annotations is new, so define plot again
plot <- ggplot(data, aes(x = leerjaar, y = score, group = sex, colour = sex)) +
  geom_point(alpha = .5) +
  facet_grid(. ~ BRIN) +
  geom_smooth(method = "lm", se = TRUE) +
  scale_x_discrete("Leerjaar") +
  scale_y_continuous("Vaardigheidsscore") +
  scale_colour_discrete("Sexe")
plot + geom_text(aes(label = annotations), size = 4, colour = "black")
# All points for the chosen subject are now annotated. Obviously, you could
# probably have more useful annotations.
# You can draw arrows to these data points too...
# We do have to limit the dataset to the annotated rows now, or all points
# will get an arrow.
annotated <- data[which(data$annotations != ""), ]
plot +
  geom_segment(
    aes(
      xend = leerjaar, yend = score,
      x = as.numeric(leerjaar) - 1, y = score + 25
    ),
    arrow = arrow(length = unit(.3, "cm")),
    data = annotated,
    colour = "black"
  )

# And finally we can combine the two, just annotating the subject's score.
plot +
  geom_segment(
    aes(
      xend = leerjaar, yend = score,
      x = as.numeric(leerjaar) - 1, y = score + 25
    ),
    arrow = arrow(length = unit(.3, "cm")),
    data = annotated,
    colour = "black"
  ) +
  geom_text(
    aes(label = score, x = as.numeric(leerjaar) - 1, y = score + 25),
    hjust = 1, vjust = 0, size = 4, colour = "black",
    data = annotated
  )
# As you might imagine, manually setting annotations for plots with two facet
# variables and different grouping and colouring variables can become a hassle.

# Another example...
ti <- read.csv("titanic3.csv")
ti$pclass <- factor(ti$pclass, labels = c("first", "second", "third"))
ti$survived <- as.factor(ti$survived)
ti.glm <- glm(survived ~ age + sex + pclass, family = "binomial", data = ti)
# type = "response" returns fitted values on the response scale of the
# binomial model; they are stored in a column called `odds`.
ti$odds <- predict(ti.glm, newdata = ti, type = "response", se.fit = FALSE)

# Age on x axis, predicted values on y. Create groups by class, and colour
# them separately.
ggplot(ti, aes(x = age, y = odds, colour = pclass)) +
  # Create separate plots by sex
  facet_wrap(~sex) +
  # Model arguments such as `family` are handed to glm() through method.args
  stat_smooth(
    method = "glm", formula = y ~ x,
    method.args = list(family = "binomial")
  ) +
  # Set labels for axes and main title
  labs(
    title = "Predicted odds of survival\nby sex, class and age",
    x = "Age",
    y = "Odds of survival"
  ) +
  # Set label for legend.
  scale_colour_discrete(name = "Passenger Class")
# We can also easily show weights...
ex$weight <- runif(10, 1, 5)
ggplot(ex, aes(x, y)) + geom_point(aes(size = weight))

# Use alpha levels to better display crowded scatterplots...
ex <- data.frame(x = rnorm(50000), y = rnorm(50000))
ggplot(ex, aes(x, y)) + geom_point(alpha = .1)

# ... or use a (hex)binplot
ggplot(ex, aes(x, y)) + geom_hex()

# There are thousands of applications, and many sources of help (ggplot2 is
# all the rage right now, there are plenty of examples and tutorials to be
# found).

###### Examples from the presentation ########

### First some new data;
library(mvtnorm) # needed for the multivariate normal generator rmvnorm()
# Covariance matrix for an x and 2 y variables, where x covaries with both
# y's but the y's are independent.
cov.mat <- matrix(c(1, .5, .3, .5, 1, 0, .3, 0, 1), ncol = 3)
ex <- rmvnorm(
  5000,              # generate 5000 rows
  mean = c(0, 0, 5), # differing means for both y's
  sigma = cov.mat    # specify the covariance matrix
)
cov(ex) # reality check, compute covariance matrix.
cov.mat # close enough.
# Cast the data in a data.frame and add some category and gender data
# (1:5 and 1:2 are recycled over the rows).
ex <- data.frame(ex, cat = 1:5, sex = 1:2)
colnames(ex) <- c("x", "y1", "y2", "cat", "sex")
ex$sex <- factor(ex$sex)

### Dual y axis; contrary to what I said earlier, this is NOT possible. Reason
### being, the author of ggplot feels that using dual scales on one image is
### misleading, arbitrary, and just bad practice.
# Link to author's comments on this (bottom post, by Hadley, is the author):
# http://stackoverflow.com/questions/3099219/how-to-use-ggplot2-make-plot-with-2-y-axes-one-y-axis-on-the-left-and-another
# Plotting variables with different scales:
ggplot(ex) +
  geom_point(aes(x = x, y = y1), colour = "red") +
  geom_point(aes(x = x, y = y2), colour = "blue")
# There is one exception; transformations of data, i.e. mile/km,
# Celsius/Fahrenheit. Plotting two pieces of data with different scales within
# the same panel is not possible.
# What we can do is plot variables side by side, however then we first have to
# transform the data into 'long form'.
library(reshape2)
# This 'melts' the variables in a data set (except those marked as id.vars)
# into a two-variable long form; a column for variable (y1 or y2) and a column
# for value.
mex <- melt(ex, id.vars = c("x", "cat", "sex"))

# We can now use the variable column to put plots side by side;
ggplot(mex, aes(x = x, y = value)) +
  geom_point() +
  facet_wrap(~variable)
# Both variables are now plotted on one y axis, this may or may not be
# desirable. Options for scales include "fixed" (default), "free_y" and
# "free_x".
ggplot(mex, aes(x = x, y = value)) +
  geom_point() +
  facet_wrap(~variable, scales = "free_y")
# And now we have separate scales for each plot.

plot <- ggplot(mex, aes(x = x, y = value, colour = sex)) + geom_point()
# Note that when using facet_grid, y scales are the same for all plots
# horizontally, while x scales are the same vertically.
plot + facet_grid(. ~ variable, scales = "free")
# This does work:
plot + facet_grid(variable ~ ., scales = "free")
# facet_wrap does allow different x scales on the same row.
plot + facet_wrap(~variable, scales = "free")

### Wrap up; it's not (easily) possible to put two different x or y scales on
### the same plot, unless the second scale is a one-to-one transformation of
### the first scale.
# If you really want to do this in ggplot, here's a link to get you started;
# http://rpubs.com/kohske/dual_axis_in_ggplot2

### Plotting score by average score for that group
# Let's take a sample out of the dataset to get some more fluctuation in our
# data and summaries. Select 100 rows out of the original 5000; this samples
# from the wide data, since melting 'split' observations to get y1 and y2 on
# separate rows.
ex <- ex[sample(nrow(ex), 100), ]
# Back to long form, splitting the observations again.
mex <- melt(ex, id.vars = c("x", "cat", "sex"))
# First off, just adding group means to a plot is quite simple;
plot <- ggplot(mex, aes(x = cat, y = value, colour = sex)) +
  geom_point() +
  facet_wrap(~variable, scales = "free")
plot
plot + stat_summary(fun = mean, geom = "point", size = 3, colour = "red")
# stat_summary is quite versatile, and can take functions on y values and on
# the data frame (fun, fun.max, fun.min, fun.data).

# You can write your own functions for summary statistics, or grab some from
# other packages, e.g. Hmisc;
library(Hmisc)
plot + stat_summary(fun.data = mean_cl_normal, geom = "errorbar")

# You might prefer to do your own summary statistics, and plot those through
# the 'normal' geometric elements, giving you more control over what gets
# plotted.
library(plyr) # provides ddply() and summarise()
# ddply takes a data frame, splits it into subsets, applies a function to each
# and combines the results back into a data frame.
# NOTE: the result is stored as `summary` (shadowing the base function's name
# as a variable) because the rest of the script refers to it by that name.
summary <- ddply(
  mex,
  # Subsets for each combination of sex, cat and y variable (since those are
  # the levels in our plot).
  ~ sex + cat + variable,
  # Function to apply is summarise, which itself is a 'meta-function' to
  # perform various operations on a data set.
  summarise,
  # Extra arguments for summarise: mean + standard error.
  mean = mean(value),
  se = sd(value) / sqrt(length(value))
)

# We can now plot this data in its own layer of the plot;
# with error bars
plot +
  geom_errorbar(
    data = summary,
    aes(y = mean, ymin = mean - se, ymax = mean + se)
  )
# with a self-set smoother, showing a confidence interval
plot +
  geom_smooth(
    data = summary,
    aes(y = mean, ymin = mean - se, ymax = mean + se),
    stat = "identity"
  )
# Note that just adding a smoother will also calculate some form of confidence
# interval, but you have less control over this.
plot + geom_smooth()
# There are many more options, I'll leave that to your imagination.

### However, the actual question was on how to plot score by mean score. I'm
### not sure if this can be done from within ggplot directly (it probably
### can), but adding group level information to each data point is quite easy.
# We already have the required summary stats, we just need to merge the data
# into the main data frame. Merge summary and mex, using columns sex, cat and
# variable as unique indices.
mexplus <- merge(mex, summary, by = c("sex", "cat", "variable"))
# We can now use these variables as in any other plot;
ggplot(mexplus, aes(x = mean, y = value, colour = sex, group = sex)) +
  facet_wrap(~variable, scales = "free") +
  geom_point()

### Finally, there was a question on how to set axis limits
plot <- ggplot(ex, aes(x = cat, y = y1)) + geom_point()
plot
# The easiest way, a wrapper for setting limits, but note that this throws
# away data! If you use statistical transforms, this means your estimates are
# no longer correct!
plot + ylim(0, 10)
# Means are above and below 0:
plot + stat_summary(fun = mean, geom = "point", size = 5, colour = "red")
# Means are all above 0!
plot +
  stat_summary(fun = mean, geom = "point", size = 5, colour = "red") +
  ylim(0, 10)
# scale_AESTHETIC_TYPE, where the aesthetic is usually x, y or colour and the
# type is discrete or continuous. This is the proper function to change scale
# aspects, and we can change a lot more here, but setting a limit here will
# still throw away data.
plot + scale_y_continuous(limits = c(0, 10))
# Limits the viewport, but does not throw away data.
plot + coord_cartesian(ylim = c(0, 10))
# Proof of the pudding.
plot +
  coord_cartesian(ylim = c(0, 10)) +
  stat_summary(fun = mean, geom = "point", size = 5, colour = "red")

# Note that changing the limits of faceted plots is something that is harder
# to do. colour and group are aesthetics, so they belong inside aes();
# passed to ggplot() directly they are silently ignored.
plot <- ggplot(mex, aes(x = x, y = value, colour = sex, group = sex)) +
  geom_point()
plot + facet_grid(variable ~ cat)
# Sets the scales to the data
plot + facet_grid(variable ~ cat, scales = "free")
# Changes the limits on all facets.
plot + facet_grid(variable ~ cat) + ylim(-2, 2)
# We cannot change any specific facet directly.
# What we can do, is add 'dummy data' to increase the scale of a plot.
# Create two data points with x values of -100 and 100, and y1 values of -10
# and 10.
dummy.data <- data.frame(x = c(-100, 100), y1 = c(-10, 10))
# Plot these data points in a blank geom layer; this increases the size of the
# plot, and thereby the scale.
ggplot(ex, aes(x = x, y = y1)) +
  geom_point() +
  geom_blank(data = dummy.data)
# With some clever dummy data, we can then also increase the scale for
# individual facets (if free scales are on), but we cannot reduce the size
# without deleting or omitting data that falls outside the scale to 'trick'
# ggplot.