Monday, November 3, 2014

Graphics for Statistics - figures with ggplot2 - chapter 5 quantile-quantile plots

1 Keen Chapter 5

1.1 Figure 5.1 normal quantile-quantile plot

  • first set up the data vector
  • then we use a point layer with the quantile-quantile stat and change the size of the points: geom_point(stat="qq",shape=1,size=3.5)

require(ggplot2)
require(grid)
mass<-c(5.9,32.0,40.0,51.5,70.0,100.0,78.0,80.0,85.0,85.0,
              110.0,115.0,125.0,130.0,120.0,120.0,130.0,135.0,110.0,130.0,
              150.0,145.0,150.0,170.0,225.0,145.0,188.0,180.0,197.0,218.0,
              300.0,260.0,265.0,250.0,250.0,300.0,320.0,514.0,556.0,840.0,
              685.0,700.0,700.0,690.0,900.0,650.0,820.0,850.0,900.0,1015.0,
              820.0,1100.0,1000.0,1100.0,1000.0,1000.0)
df <- data.frame(y=mass)

ggplot(df,aes(sample=mass)) +
    geom_point(stat="qq",shape=1,size=3.5) +
    scale_x_continuous(limits=c(-3,3),breaks=-3:3,expand=c(0,0.1)) +
    scale_y_continuous(limits=c(0,1200),breaks=seq(0,1200,by=200),expand=c(0,20)) +
    xlab("Standard Normal Quantiles") +
    ylab("Mass (g)") +
          theme(
          panel.background=element_blank(),
          axis.line=element_line(colour="black"),
          axis.text=element_text(colour="black",size=14),
          axis.title=element_text(colour="black",size=14),
          axis.ticks=element_line(colour="black"),
          axis.ticks.length=unit(3,"mm")
          )


1.2 Figure 5.2 normal quantile-quantile plot

  • only use coord_flip() on the previous plot


ggplot(df,aes(sample=mass)) +
    geom_point(stat="qq",shape=1,size=3.5) +
    scale_x_continuous(limits=c(-3,3),breaks=-2:2,expand=c(0,0.5)) +
    scale_y_continuous(limits=c(0,1200),breaks=seq(0,1200,by=200),expand=c(0,20)) +
    xlab("Standard Normal Quantiles") +
    ylab("Mass (g)") +
    coord_flip() +
          theme(
          panel.background=element_blank(),
          axis.line=element_line(colour="black"),
          axis.text=element_text(colour="black",size=14),
          axis.title=element_text(colour="black",size=14),
          axis.ticks=element_line(colour="black"),
          axis.ticks.length=unit(3,"mm")
          )


1.3 Figure 5.3 normal quantile-quantile plot

  • only add geom_abline() with intercept=mean(mass) and slope=sd(mass) (the line goes through the point defined by the mean of both, x and y values which is (mean(standard normal dist)=0,mean(mass)) therefore mean(mass) is the intercept; the sd of the norm is one so the slope is sd(mass)/sd(standard normal dist)=sd(mass))


ggplot(df,aes(sample=mass)) +
    geom_point(stat="qq",shape=1,size=3.5) +
    geom_abline(intercept=mean(mass),slope=sd(mass)) + 
    scale_x_continuous(limits=c(-3,3),breaks=-3:3,expand=c(0,0.1)) +
    scale_y_continuous(limits=c(0,1200),breaks=seq(0,1200,by=200),expand=c(0,20)) +
    xlab("Standard Normal Quantiles") +
    ylab("Mass (g)") +
          theme(
          panel.background=element_blank(),
          axis.line=element_line(colour="black"),
          axis.text=element_text(colour="black",size=14),
          axis.title=element_text(colour="black",size=14),
          axis.ticks=element_line(colour="black"),
          axis.ticks.length=unit(3,"mm")
          )


1.4 Figure 5.4 gamma quantile-quantile plot

  • first we calculate the estimates of the scale and shape parameter of the gamma dist
  • then we only change the dist to qgamma (distribution="qgamma") and set the parameters as list (dparams=list(shape=sshape,scale=sscale))
  • last adjust the x-axis (limits and breaks)

sshape<-(mean(mass)/sd(mass))**2
sscale<-var(mass)/mean(mass)

ggplot(df,aes(sample=mass)) +
    geom_point(stat="qq",distribution="qgamma",dparams=list(shape=sshape,scale=sscale),shape=1,size=3.5) +
    scale_x_continuous(limits=c(0,1500),breaks=seq(0,1500,by=500),expand=c(0,20)) +
    scale_y_continuous(limits=c(0,1200),breaks=seq(0,1200,by=200),expand=c(0,20)) +
    xlab("Gamma Quantiles") +
    ylab("Mass (g)") +
          theme(
          panel.background=element_blank(),
          axis.line=element_line(colour="black"),
          axis.text=element_text(colour="black",size=14),
          axis.title=element_text(colour="black",size=14),
          axis.ticks=element_line(colour="black"),
          axis.ticks.length=unit(3,"mm")
          )


1.5 Figure 5.5 normal quantile-quantile plot

  • we only have to change the x-axis of plot 5.1
  • so set the limits to c(-4,4) and the breaks to the required x-values by using the qnorm() function and change the labels appropriately
  • finally change to title of the x-axis

ggplot(df,aes(sample=mass)) +
    geom_point(stat="qq",shape=1,size=3.5) +
    scale_x_continuous(limits=c(-4,4),
                       breaks=qnorm(c(0.0001,.01,.05,.25,.50,.75,.95,.99,.9999)),
                       labels=c(0.01,1,5,25,50,75,95,99,99.99),
                       expand=c(0,0.1)) +
    scale_y_continuous(limits=c(0,1200),breaks=seq(0,1200,by=200),expand=c(0,20)) +
    xlab("Normal Percentiles") +
    ylab("Mass (g)") +
          theme(
          panel.background=element_blank(),
          axis.line=element_line(colour="black"),
          axis.text=element_text(colour="black",size=14),
          axis.title=element_text(colour="black",size=14),
          axis.ticks=element_line(colour="black"),
          axis.ticks.length=unit(3,"mm")
          )


1.6 Figure 5.6 normal quantile-quantile plot

  • it is the same like plot 5.5 - only add coord_flip()

ggplot(df,aes(sample=mass)) +
    geom_point(stat="qq",shape=1,size=3.5) +
    scale_x_continuous(limits=c(-4,4),
                       breaks=qnorm(c(0.0001,.01,.05,.25,.50,.75,.95,.99,.9999)),
                       labels=c(0.01,1,5,25,50,75,95,99,99.99),
                       expand=c(0,0.1)) +
    scale_y_continuous(limits=c(0,1200),breaks=seq(0,1200,by=200),expand=c(0,20)) +
    xlab("Normal Percentiles") +
    ylab("Mass (g)") +
    coord_flip() +
          theme(
          panel.background=element_blank(),
          axis.line=element_line(colour="black"),
          axis.text=element_text(colour="black",size=14),
          axis.title=element_text(colour="black",size=14),
          axis.ticks=element_line(colour="black"),
          axis.ticks.length=unit(3,"mm")
          )


1.7 Figure 5.7 gamma quantile-quantile plot

  • from figure 5.4 we have to change the axis and flip the coordinates:
  • to calculate the breaks we use qgamma() with the appropriate values and the shape and scale parameter
  • then we change the labels accordingly
  • add coord_flip()

sshape<-(mean(mass)/sd(mass))**2
sscale<-var(mass)/mean(mass)

ggplot(df,aes(sample=mass)) +
    geom_point(stat="qq",distribution="qgamma",dparams=list(shape=sshape,scale=sscale),shape=1,size=3.5) +
    scale_x_continuous(limits=c(0,2400),
                       breaks=qgamma(c(0.01,.25,.50,.75,.95,.99,.999),shape=sshape,scale=sscale),
                       labels=c(1,25,50,75,95,99,99.9),expand=c(0,20)) +
    scale_y_continuous(limits=c(0,1200),breaks=seq(0,1200,by=200),expand=c(0,20)) +
    xlab("Gamma Quantiles") +
    ylab("Mass (g)") +
    coord_flip() +
          theme(
          panel.background=element_blank(),
          axis.line=element_line(colour="black"),
          axis.text=element_text(colour="black",size=14),
          axis.title=element_text(colour="black",size=14),
          axis.ticks=element_line(colour="black"),
          axis.ticks.length=unit(3,"mm")
          )


Sunday, November 2, 2014

Graphics for Statistics - figures with ggplot2 - chapter 4 ecdf plots

1 Keen Chapter 4

Graphics out of the book Graphics for Statistics and Data Analysis with R by Kevin Keen (book home page)

1.1 Figure 4.17 EDF plot

  • first set up the data frame:
    • we use the ecdf() function to get create a stepfunction ecdfmass()
    • from this function we can extract the knots (which will be mapped to the x-axis)
    • using this knots as arguments in ecdfmass() we'll get the belonging probabilities (which we will map to the y-axis)
    • the end column in df contains the end points of the horizontal lines in the step function - it's only the knots vector beginning with the second element and setting the last element to NA

mass<-c(5.9,32.0,40.0,51.5,70.0,100.0,78.0,80.0,85.0,85.0,
        110.0,115.0,125.0,130.0,120.0,120.0,130.0,135.0,110.0,130.0,
        150.0,145.0,150.0,170.0,225.0,145.0,188.0,180.0,197.0,218.0,
        300.0,260.0,265.0,250.0,250.0,300.0,320.0,514.0,556.0,840.0,
        685.0,700.0,700.0,690.0,900.0,650.0,820.0,850.0,900.0,1015.0,
        820.0,1100.0,1000.0,1100.0,1000.0,1000.0)


ecdfmass <- ecdf(mass)
kn <- knots(ecdfmass)
ed <- ecdfmass(kn)

df <- data.frame(knots=kn,ed=ed,end=c(kn[-1],NA))
head(df)

  knots         ed  end
1   5.9 0.01785714 32.0
2  32.0 0.03571429 40.0
3  40.0 0.05357143 51.5
4  51.5 0.07142857 70.0
5  70.0 0.08928571 78.0
6  78.0 0.10714286 80.0

  • now we first set the aesthetics for the points: x to knots and y to ed
  • add the point layer and setting the point size to 3: geom_point(size=3)
  • add the lines using a segment layer setting the aesthetics xend and yend: geom_segment(aes(xend=end,yend=ed))
  • in the next step we add two addition segment layers, one for each arrow; you can also use annotate() to do this; inside these segments we use the arrow() function from the grid package, so we can define the appearance of our arrows
  • the next two lines change the appearance of the axes: setting the limits, the breaks and the expansion
  • then set the appropriate axes titles and customize axis elements and panel.background



require(grid) ## for the arrow() function
ggplot(df,aes(x=knots,y=ed)) +
    geom_point(size=3) +
    geom_segment(aes(xend=end,yend=ed)) +
    geom_segment(x=min(df$knots),xend=-45,y=0,yend=0,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=max(df$knots),xend=1245,y=1,yend=1,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    scale_x_continuous(limits=c(-45,1245),breaks=seq(0,1200,by=200),expand=c(0,0)) +
    scale_y_continuous(limits=c(-0.01,1.05),breaks=seq(0,1,by=0.2),expand=c(0,0)) +
    xlab("Mass (g)") +
    ylab("Empirical Distribution Function") +
    theme(
        panel.background=element_blank(),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks=element_line(colour="black"),
        axis.ticks.length=unit(3,"mm")
        )


1.2 Figure 4.18 EDF plot

  • we only have to change the axis breaks of the y-axis and add the horizontal lines
  • the first is done changing the by argument in the seq() to 0.25 in scale_y_continuous()
  • then we add a hline layer


ggplot(df,aes(x=knots,y=ed)) +
    geom_point(size=3) +
    geom_segment(aes(xend=end,yend=ed)) +
    geom_segment(x=min(df$knots),xend=-45,y=0,yend=0,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=max(df$knots),xend=1245,y=1,yend=1,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_hline(yintercept=c(0.25,0.5,0.75),linetype=2) +
    scale_x_continuous(limits=c(-45,1245),breaks=seq(0,1200,by=200),expand=c(0,0)) +
    scale_y_continuous(limits=c(-0.01,1.05),breaks=seq(0,1,by=0.25),expand=c(0,0)) +
    xlab("Mass (g)") +
    ylab("Empirical Distribution Function") +
    theme(
        panel.background=element_blank(),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks=element_line(colour="black"),
        axis.ticks.length=unit(3,"mm")
        )



  • we also can use the grid lines (but then we have also lines at 0 and 1


ggplot(df,aes(x=knots,y=ed)) +
    geom_point(size=3) +
    geom_segment(aes(xend=end,yend=ed)) +
    geom_segment(x=min(df$knots),xend=-45,y=0,yend=0,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=max(df$knots),xend=1245,y=1,yend=1,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    scale_x_continuous(limits=c(-45,1245),breaks=seq(0,1200,by=200),expand=c(0,0)) +
    scale_y_continuous(limits=c(-0.01,1.05),breaks=seq(0,1,by=0.25),expand=c(0,0)) +
    xlab("Mass (g)") +
    ylab("Empirical Distribution Function") +
    theme(
        panel.background=element_blank(),
        panel.grid.major.x=element_blank(),
        panel.grid.major.y=element_line(linetype = 2,colour="grey50"),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks=element_line(colour="black"),
        axis.ticks.length=unit(3,"mm")
        )


1.3 Figure 4.19 EDF plot

  • replace geom_point() by geom_step()
  • get rid of the horizontal lines
  • add another little segment which connects the left arrow with the step function: geom_segment(x=min(df$knots),xend=min(df$knots),y=0,yend=min(df$ed),size=0.4)
  • leave everything as it is


ggplot(df,aes(x=knots,y=ed)) +
    geom_step(direction = "hv") +
    geom_segment(x=min(df$knots),xend=-45,y=0,yend=0,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=max(df$knots),xend=1245,y=1,yend=1,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=min(df$knots),xend=min(df$knots),y=0,yend=min(df$ed),size=0.4) +
    scale_x_continuous(limits=c(-45,1245),breaks=seq(0,1200,by=200),expand=c(0,0)) +
    scale_y_continuous(limits=c(-0.01,1.05),breaks=seq(0,1,by=0.2),expand=c(0,0)) +
    xlab("Mass (g)") +
    ylab("Empirical Distribution Function") +
    theme(
        panel.background=element_blank(),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks=element_line(colour="black"),
        axis.ticks.length=unit(3,"mm")
        )



  • instead of using the data frame created above you can use the original data (mass) and set stat to ecdf
  • but for the arrows you need to calculate the vals anyway, that's why I use the data frame above


df3 <- data.frame(mass=mass)
ggplot(df,aes(x=knots,y=ed)) +
    geom_step(inherit.aes=F,stat="ecdf",data=df3,aes(x=mass)) +
    geom_segment(x=min(df$knots),xend=-45,y=0,yend=0,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=max(df$knots),xend=1245,y=1,yend=1,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=min(df$knots),xend=min(df$knots),y=0,yend=min(df$ed),size=0.4) +
    scale_x_continuous(limits=c(-45,1245),breaks=seq(0,1200,by=200),expand=c(0,0)) +
    scale_y_continuous(limits=c(-0.01,1.05),breaks=seq(0,1,by=0.2),expand=c(0,0)) +
    xlab("Mass (g)") +
    ylab("Empirical Distribution Function") +
    theme(
        panel.background=element_blank(),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks=element_line(colour="black"),
        axis.ticks.length=unit(3,"mm")
        )



1.4 Figure 4.20 EDF plot

  • the last plot only with the horizontal grid lines add the quartiles


ggplot(df,aes(x=knots,y=ed)) +
    geom_step(direction = "hv") +
    geom_segment(x=min(df$knots),xend=-45,y=0,yend=0,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=max(df$knots),xend=1245,y=1,yend=1,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=min(df$knots),xend=min(df$knots),y=0,yend=min(df$ed),size=0.4) +
    geom_hline(yintercept=c(0.25,0.5,0.75),linetype=2) +
    scale_x_continuous(limits=c(-45,1245),breaks=seq(0,1200,by=200),expand=c(0,0)) +
    scale_y_continuous(limits=c(-0.01,1.05),breaks=seq(0,1,by=0.25),expand=c(0,0)) +
    xlab("Mass (g)") +
    ylab("Empirical Distribution Function") +
    theme(
        panel.background=element_blank(),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks=element_line(colour="black"),
        axis.ticks.length=unit(3,"mm")
        )



1.5 Figure 4.20 EDF plot with cumulative normal distribution function added

  • first we need to create a second data frame containing the values defining the curve, we choose to use 10000 points on the x-axis and use pnorm() to calculate to respective y values (using the empirical mean and the empirical sd of the vector mass)
  • then we add the layer (geom_line())
  • change the limits of the x-axis and the breaks inside scale_x_continuous()
  • change the length of the right arrow (setting xend to 1500)


mean_mass<-mean(mass)
sd_mass<-sd(mass)
min_mass<-min(mass)
max_mass<-1500

xx <- seq(0,10000,1)*(max_mass-min_mass)/10000.+min_mass
yy <- pnorm(xx,mean_mass,sd_mass)

df2 <- data.frame(xx=xx,yy=yy)

ggplot(df,aes(x=knots,y=ed)) +
    geom_step(direction = "hv") +
    geom_segment(x=min(df$knots),xend=-45,y=0,yend=0,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=max(df$knots),xend=1500,y=1,yend=1,arrow=arrow(length = unit(0.15,"cm")),size=c(0.4)) +
    geom_segment(x=min(df$knots),xend=min(df$knots),y=0,yend=min(df$ed),size=0.4) +
    geom_line(data=df2,aes(x=xx,y=yy)) +
    scale_x_continuous(limits=c(-45,1500),breaks=seq(0,1500,by=500),expand=c(0,0)) +
    scale_y_continuous(limits=c(-0.01,1.05),breaks=seq(0,1,by=0.2),expand=c(0,0)) +
    xlab("Mass (g)") +
    ylab("Empirical Distribution Function") +
    theme(
        panel.background=element_blank(),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks=element_line(colour="black"),
        axis.ticks.length=unit(3,"mm")
        )


Graphics for Statistics - figures with ggplot2 - Chapter 3 part 2 Bar charts with errorbars, dot-whisker charts

1 Keen Chapter 3 part 2

Graphics out of the book Graphics for Statistics and Data Analysis with R by Kevin Keen (book home page)

1.1 Figure 3.8 Bar-whisker chart

  • we will work out the graphic on flipped axes and flip them later
  • set the aesthetics:
    • x to the names of the allergenes
    • y to the percentage (prevalence)
    • ymin to y minus the standard error and
    • ymax to y plus the standard error
  • using again geom_bar() with the stat="identity" option because we have already aggregated data (so the height of the bars is set by the y aesthetic)
  • set the filling and the colour of the edges (fill and colour), finally adjust the width (width) of the bars
  • there is no axis title on the axis with the names, so set xlab("") (remember we will flip the axis later)
  • set the title of the continuous axis to Percent and
  • set the limits of the axis to 0 and 50
  • set expansion of it to c(0,0) - because we did not want to expand the axis, it should actually end at 0 and 50
  • now we flip the axes
  • and set the appearance of the text, axis and background elements

require(ggplot2)

names<-c("Epidermals","Dust Mites","Weeds","Grasses","Molds","Trees")
prevs<-c(38.2,37.8,31.1,31.1,29.3,26.7)
se<-c(3.2,3.2,3.1,3.1,3.0,2.9)

df <- data.frame(item=factor(1:6,labels=names),prevs=prevs,se=se)

ggplot(df,aes(x=item,y=prevs,ymin=prevs-se,ymax=prevs+se)) +
    geom_bar(stat="identity",fill="transparent",colour="black",width=0.7) +
    geom_errorbar(width=0.3) +
    xlab("") +
    ylab("Percent") +
    scale_y_continuous(limits=c(0,50),expand=c(0,0)) +
    coord_flip() +
    theme(
        panel.background=element_blank(),
        axis.line=element_line(colour="black"),
        axis.line.y=element_blank(),
        axis.text=element_text(colour="black",size=14),
        axis.ticks.x=element_line(colour="black"),
        axis.ticks.y=element_blank()
        )


1.2 Figure 3.9 Bar and single whisker chart

  • the same as above, only change the filling of the bars to black (you do not need the colour argument any more) and the width of the error bars to 0

ggplot(df,aes(x=item,y=prevs,ymin=prevs-se,ymax=prevs+se)) +
    geom_bar(stat="identity",fill="black",width=0.7) +
    geom_errorbar(width=0) +
    xlab("") +
    ylab("Percent") +
    scale_y_continuous(limits=c(0,50),expand=c(0,0)) +
    coord_flip() +
    theme(
        panel.background=element_blank(),
        axis.line=element_line(colour="black"),
        axis.line.y=element_blank(),
        axis.text=element_text(colour="black",size=14),
        axis.ticks.x=element_line(colour="black"),
        axis.ticks.y=element_blank()
        )


1.3 Figure 3.10 Dot-whisker chart

  • for the dot-whisker chart we replace geom_bar() by geom_point()
  • in geom_point() we set the point size to four and the colour to black
  • in geom_errorbar() we set the the width to 0.3
  • add geom_vline() for the dotted lines and set the aesthetics xintercept to as.numeric(item) (because this aesthetic expects a numeric argument)
  • then change some elements in the theme section
    • set the colour in panel.border to black and do not forget to set fill to NA (you won't see anything if you don't)
    • remove the axis.line.y line


ggplot(df,aes(x=item,y=prevs,ymin=prevs-se,ymax=prevs+se)) +
    geom_point(colour="black",size=4) +
    geom_errorbar(width=0.25) +
    geom_vline(aes(xintercept=as.numeric(item)),linetype=3,size=0.4) +
    xlab("") +
    ylab("Percent") +
    scale_y_continuous(limits=c(0,50),expand=c(0,0)) +
    coord_flip() +
    theme(
        panel.background=element_blank(),
        panel.border=element_rect(colour="black",fill=NA),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks.x=element_line(colour="black"),
        axis.ticks.y=element_blank()
        )


1.4 Figure 3.11 Dot-whisker chart

  • only minor changes to the previous plot
  • remove geom_vline()
  • adjust the widths of the error bars


ggplot(df,aes(x=item,y=prevs,ymin=prevs-se,ymax=prevs+se)) +
    geom_point(colour="black",size=4) +
    geom_errorbar(width=0.1) +
    xlab("") +
    ylab("Percent") +
    scale_y_continuous(limits=c(0,50),expand=c(0,0)) +
    coord_flip() +
    theme(
        panel.background=element_blank(),
        panel.border=element_rect(colour="black",fill=NA),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks.x=element_line(colour="black"),
        axis.ticks.y=element_blank()
        )


1.5 Figure 3.12 Dot-whisker chart

  • only adjust the widths of the error bars


ggplot(df,aes(x=item,y=prevs,ymin=prevs-se,ymax=prevs+se)) +
    geom_point(colour="black",size=4) +
    geom_errorbar(width=0) +
    xlab("") +
    ylab("Percent") +
    scale_y_continuous(limits=c(0,50),expand=c(0,0)) +
    coord_flip() +
    theme(
        panel.background=element_blank(),
        panel.border=element_rect(colour="black",fill=NA),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks.x=element_line(colour="black"),
        axis.ticks.y=element_blank()
        )


1.6 Figure 3.13 two-tiered dot-whisker chart

  • there are several possibilities
  • I decided to use two error bar layers so first
  • I have to move the aesthetics for ymin and ymax to geom_errorbar(), I set the width to 0.2
  • then I add a second geom_errorbar() set there also aesthetics but now ymin to prevs-1.96*se and ymax to prevs+1.96*se


ggplot(df,aes(x=item,y=prevs)) +
    geom_point(colour="black",size=4) +
    geom_errorbar(aes(ymin=prevs-se,ymax=prevs+se),width=0.2) +
    geom_errorbar(aes(ymin=prevs-1.96*se,ymax=prevs+1.96*se),width=0) +
    xlab("") +
    ylab("Percent") +
    scale_y_continuous(limits=c(0,50),expand=c(0,0)) +
    coord_flip() +
    theme(
        panel.background=element_blank(),
        panel.border=element_rect(colour="black",fill=NA),
        axis.line=element_line(colour="black"),
        axis.text=element_text(colour="black",size=14),
        axis.title=element_text(colour="black",size=14),
        axis.ticks.x=element_line(colour="black"),
        axis.ticks.y=element_blank()
        )


Friday, October 31, 2014

R - plot with two different y-axes on top of each other

1 two plots on top of each other

  • first create to vectors of y-values
  • then set the margins appropriately
  • then we plot the first series of y-values and plot the axis title

y.plot1 <- sort(runif(10))
y.plot2 <- runif(10)*5
par(mar=c(5,6,2,4))
plot(y.plot1,lwd = 4,ann=F,las=2,type="l")
mtext("y.plot1",side = 2,line=3.5)



  • now we set the graphics parameter new to TRUE ; this causes R not to clean the plot before the next high level graphic command is executed
  • then we plot the second plot without annotations (because we want the axis on the right side, which would be plotted on the left side by default)
  • then we plot the y-axis on the right side of the plot (the four in axis(4, ...) indicates the side) and add the annotations

y.plot1 <- sort(runif(10))
par(new=T)
plot(y.plot2,ann=F,axes = F,col="red",type="l",lwd = 4)
axis(4,col="red",col.ticks = "red",lwd=3,at=0:4)
mtext("y.plot2",side = 4,line=3,col="red")


R - get weather data from openweather api

1 get weather data an from openweather via R

  • load the rjson package which is able to parse json into R formats
  • set the file argument to the uri
  • within the uri you can specify the city or a weather station (see http://openweathermap.org/api for details), here are a example by station

require(rjson)
tt <- fromJSON(file=
  "http://api.openweathermap.org/data/2.5/history/station?id=7447&type=day")

  • the fromJSON() command converts the JSON formatted data into a list
  • the list element named list contains the weather data for one month previous in the free account
  • the little function extract.temp() below, extracts the min and max temperature and the wind speed (together with the time)

extract.temp <- function(x){
    return(data.frame(min=x$temp$mi-273.15,
                      max=x$temp$ma-273.15,
                      wind=unlist(x$wind$speed[1]),
                      tstmp=x$dt))
}

temps <- Reduce(rbind,lapply(tt$list , extract.temp))
temps$tstmp <- as.Date(as.POSIXct(temps$tstmp,
                       origin = as.Date("1970-01-01")))
head(temps)

   min max wind      tstmp
v   16  26 1.42 2014-10-05
v1  12  23 2.31 2014-10-06
v2  14  24 2.12 2014-10-07
v3  15  25 3.14 2014-10-08
v4  15  25 3.33 2014-10-09
v5  16  25 3.14 2014-10-10

  • now we plot the data using the symbols() command, plotting wind speed on the y-axis and time on the x-axis (there is some mix-up because of the the beginning of winter time (from daylight saving time) on oct, 26th


nod <- length(tt$list)

symbols(x=temps$tstmp,
        y=temps$wind,
        thermometers = cbind(.4,1.6,temps$min/40,temps$max/40),
        fg = "red",
        ylim = c(-0.5,max(temps$wind)+1),
        xaxt="n",
        xlab = "day", ylab="wind speed",
        main="Temperature of the last 30 days")
axis(1,at=temps$tstmp,labels=
                substr(as.character(temps$tstmp),6,10),las=2)

 text(x=temps$tstmp,
      y=temps$wind-0.5,rep(0,nod),adj=c(0.5,0))
 text(x=temps$tstmp,
      y=temps$wind+0.5,rep(40,nod),adj=c(0.5,0))
 text(x=temps$tstmp,
      y=temps$wind,rep(20,nod),adj=c(0.5,0.5))




  • with the little function below you can find the station ids by lat and lon


get.station <- function(lat,lon){
    tmpdat <- fromJSON(file=
     paste0("http://api.openweathermap.org/data/2.5/station/find?lat=",lat,
            "&lon=",lon,"&cnt=1"))
    print(paste("found:",tmpdat[[1]]$station$name,", id:",tmpdat[[1]]$station$id))
    tmpdat[[1]]$station$id
}

station <- get.station(lat=34,lon=118.3)
station

[1] "found: ZSNJ , id: 7447"
[1] 7447



get.station(51.5,0.1)

[1] "found: EGLC , id: 5091"
[1] 5091

Sunday, October 12, 2014

R - ROC curves

Table of Contents

1 ROCR

  • load the package
  • the artificial data set ROCR.simple contains measurements and labels which you can consider as gold standard
  • the prediction() function takes the measurements and the gold standard as input
  • the value returned is a list containing inter alia:
    • measurements and labels
    • vectors of cutoffs and the number of
    • false positives
    • true positives
    • false negatives
    • true negatives corresponding to the resp. cutoffs

require(ROCR,quietly=T)
data(ROCR.simple)
pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels)
str(pred)

..$ : num [1:200] 0.613 0.364 0.432 0.14 0.385 ...
 ..@ labels     :List of 1
 .. ..$ : Ord.factor w/ 2 levels "0"<"1": 2 2 1 1 1 2 2 2 2 1 ...
 ..@ cutoffs    :List of 1
 .. ..$ : num [1:201] Inf 0.991 0.985 0.985 0.983 ...
 ..@ fp         :List of 1
 .. ..$ : num [1:201] 0 0 0 0 1 1 2 3 3 3 ...
 ..@ tp         :List of 1
 .. ..$ : num [1:201] 0 1 2 3 3 4 4 4 5 6 ...
 ..@ tn         :List of 1
 .. ..$ : num [1:201] 107 107 107 107 106 106 105 104 104 104 ...
 ..@ fn         :List of 1
 .. ..$ : num [1:201] 93 92 91 90 90 89 89 89 88 87 ...
 ..@ n.pos      :List of 1
 .. ..$ : int 93
 ..@ n.neg      :List of 1
 .. ..$ : int 107
 ..@ n.pos.pred :List of 1
 .. ..$ : num [1:201] 0 1 2 3 4 5 6 7 8 9 ...
 ..@ n.neg.pred :List of 1
 .. ..$ : num [1:201] 200 199 198 197 196 195 194 193 192 191 ...

  • now we can use performance() to calculate different performance measures
  • first we plot the classic ROC Curve based on sensitivity and specificity:
    • plotting sensitivity (y-axis) against 1-specificity (x-axis) which is equivalent to
    • true positive rate against false positive rate which is equivalent to

perf <- performance(pred,"tpr","fpr")
plot(perf)



  • calculate the area under the curve (AUC) (the output is a bit messy, but you find the value of interest in the slot y.values

print(performance(pred,"auc"))

An object of class "performance"
Slot "x.name":
[1] "None"

Slot "y.name":
[1] "Area under the ROC curve"

Slot "alpha.name":
[1] "none"

Slot "x.values":
list()

Slot "y.values":
[[1]]
[1] 0.8341875


Slot "alpha.values":
list()

  • find the best cutoff (best always depends on your preferences); here we put equal weight on sensitivity and specificity and maximize the sum of them (Youden Index)
  • we write a function which takes a prediction object, the names of two performance measurements and gives back the cutoff, the maximum of the sum and the respective values of the two performance measurements

max.ss <- function(pred,meas.1,meas.2){
    meas.1 <- performance(pred,meas.1)
    meas.2 <- performance(pred,meas.2)
    x.vals <- slot(meas.1,'x.values')[[1]]
    y.vals <- slot(meas.1,'y.values')[[1]] + slot(meas.2,'y.values')[[1]]
    y1.vals <- slot(meas.1,'y.values')[[1]]
    y2.vals <- slot(meas.2,'y.values')[[1]]
    list(cut.off=x.vals[which.max(y.vals)],
                 max.sum=max(y.vals),
                 max.meas1=y1.vals[which.max(y.vals)],
                 max.meas2=y2.vals[which.max(y.vals)])
}

max.ss(pred,"sens","spec")

$cut.off
[1] 0.5014893

$max.sum
[1] 1.69993

$max.meas1
[1] 0.8494624

$max.meas2
[1] 0.8504673

  • here we get a cutoff of 0.5
  • the maximized sum is 1.70
  • the resp. sensitivity is 0.85
  • the resp. specificity is also 0.85
  • sometimes we have a clear preference because the cost of a false negative is much higher than the cost of a false positive (or vice versa)
  • therefore it exists a modified version of the Youden-Index which maximizes \(sensitivity + r\cdot specificity\) where \(r=\frac{1-prevalence}{cost\cdot prevalence}\) and \(cost\) is the cost of a false negative and \(prevalence\) is the prevalence in the population under consideration


max.ss <- function(pred,cost=1,prev=0.5){
    r <- (1-prev)/(cost*prev)
    sens <- performance(pred,"sens")
    spec <- performance(pred,"spec")
    x.vals <- slot(sens,'x.values')[[1]]
    y.vals <- slot(sens,'y.values')[[1]] + r*slot(spec,'y.values')[[1]]
    y1.vals <- slot(sens,'y.values')[[1]]
    y2.vals <- slot(spec,'y.values')[[1]]
    list(cut.off=x.vals[which.max(y.vals)],
                 sensitivity=y1.vals[which.max(y.vals)],
                 specificity=y2.vals[which.max(y.vals)])
}

max.ss(pred)

$cut.off
[1] 0.5014893

$sensitivity
[1] 0.8494624

$specificity
[1] 0.8504673

  • with the defaults cost=1 and prev=0.5 we get exactly the same result (because \(r=1\) in this case)
  • if we have a disease with prevalence of 0.1 where false negatives (i.e. not detecting a true case) are more expensive


max.ss(pred,cost=20,prev=0.1)

$cut.off
[1] 0.5014893

$sensitivity
[1] 0.8494624

$specificity
[1] 0.8504673

Wednesday, October 1, 2014

R - confidence intervals for odds ratio in matched pair

  • the confidence interval for odds ratio in matched pairs is B/C * exp(+-1.96*sqrt(1/B + 1/C)) where B and C are the counts of discordant pairs
  • here is a example with anemia in mothers ~ low birth weight


bw <- as.table(matrix(c(13,27,8,22),dimnames=list(c("anemia","no anemia"),c("low bw","normal bw")),byrow=T,nrow=2))
bw

low bw normal bw
anemia        13        27
no anemia      8        22

  • do the calculations per hand:


print(paste("odds ratio:",27/8))
27/8 * exp(c(-1,1)*1.96*sqrt(1/27 + 1/8))

[1] "odds ratio: 3.375"
[1] 1.533297 7.428844

  • or use the function from the PropCIs package


require(PropCIs,quietly=T)
oddsratioci.mp(8,27,0.95)

data:  

95 percent confidence interval:
 1.562965 7.287833

Tuesday, September 30, 2014

R - confidence intervals for risk ratio, odds ratio, attributable risk (resp. absolute risk reduction), cond. MLE odds ratio

1 R - confidence intervals for risk ratio, odds ratio, attributable risk (resp. absolute risk reduction), cond. MLE odds ratio

  • generate the data, and table them

obese <- sample(c(T,F),size=1000,replace=T)
mi <- factor((obese + sample(c(-1,0,1),prob = c(0.3,0.35,0.35),size=1000,replace=T)) > 0,labels=c("myocardinfarct","non"))
obese <- factor(obese,labels=c("obese","non-obese"))

table(obese,mi)

mi
obese       myocardinfarct non
  obese                326 187
  non-obese            144 343

  • use the twoby2 function to get the ratios incl. the confidence intervals
  • the risk difference (resp. attributable risk (AR), absolute risk reduction (ARR)) is also calculated

require(Epi)
twoby2(obese,mi)

2 by 2 table analysis: 
------------------------------------------------------ 
Outcome   : myocardinfarct 
Comparing : obese vs. non-obese 

          myocardinfarct non    P(myocardinfarct) 95% conf. interval
obese                326 187               0.6355    0.5929   0.6760
non-obese            144 343               0.2957    0.2568   0.3378

                                   95% conf. interval
             Relative Risk: 2.1491    1.8462   2.5018
         Sample Odds Ratio: 4.1525    3.1859   5.4122
Conditional MLE Odds Ratio: 4.1462    3.1584   5.4617
    Probability difference: 0.3398    0.2800   0.3959

             Exact P-value: 0 
        Asymptotic P-value: 0 
------------------------------------------------------

  • so we got a risk of myocard. infarction of 0.6507 for obese and 0.3006 for non-obese
  • dividing the first by the second (0.6507/0.3006) gives the risk ratio
  • the odds are not given for the groups, calculated by hand it would be

(326/187)/(144/343)

[1] 4.152481

  • which you see in the output of twoby2() (in addition you find the conditional MLE odds)
  • the probability difference (as stated above also known as attributable risk (AR) and absolute risk reduction (ARR)) is simply the difference between the two risks
confidence-intervals-for-odds-ratio paired case

Saturday, September 27, 2014

Flow Charts with R and the Gmisc package

1 flow charts

1.1 with the Gmisc package

  • first create a transition matrix tm, i.e. the rows represent start state the columns represent end state, so tm[1,1] is the number of subjects state in state 1 and end in state 1, so tm[1,3] is the number of subjects state in state 1 and end in state 3


tm <- matrix(NA,nrow=3,ncol=3)
tm[1,] <- 200*c(.5,.25,.25)
tm[2,] <- 800*c(.75,.1,.15)
tm[3,] <- 600*c(0,.2,.8)

tm

     [,1] [,2] [,3]
[1,]  100   50   50
[2,]  600   80  120
[3,]    0  120  480

  • a more convenient way is to tabulate start state and end state


xx <- data.frame(id=1:1000,
                 start.state=sample(1:4,size=1000,replace=T,prob=c(0.2,0.5,0.1,0.2)),
                 end.state=sample(1:4,size=1000,replace=T,prob=c(0.1,0.3,0.2,0.4)))

tm2 <- table(xx$start.state,xx$end.state)
tm2

 
    1   2   3   4
1  24  67  37  89
2  45 143  87 196
3  13  30  24  44
4  25  59  39  78

  • than a simple call to transitionPlot() does the magic
  • box_txt provides the states' names

plot.new() ## call plot.new because transitionPlot does not
transitionPlot(tm,box_txt = 1:3)






plot.new() ## call plot.new because transitionPlot does not
transitionPlot(tm2,box_txt = paste("state",1:4))




  • of course you can change the appearance of the plot, e.g. color of the boxes, appearance of the arrows
  • in a similar way text color can be changed


plot.new() ## call plot.new because transitionPlot does not

transitionPlot(tm2,box_txt = c(paste("state",1:4)),
               type_of_arrow="gradient",
               fill_start_box=c("midnightblue","darkred","darkgreen","maroon4"),
               fill_end_box=c("midnightblue","darkred","darkgreen","maroon4"))



R - create an Excel file from data frame and color the cells given by some index

1.1 create an Excel file from data frame and color the cells given by some index

  • data the following function takes an data.frame containing the data as argument
  • the second argument is also a data frame containing the indices of the cells which should be coloured, row and column are assumed the be the names of these columns (can be changed through the resp arguments)
  • the filename is set through filename (default: date+excelfile.xlsx) and
  • the fill color through color (default: red - for a list of colors type XLC after the XLConnect package is loaded)


require(XLConnect)
create.excel <- function(data,index,color=53,filename=NULL,row="row",column="column"){
    names(index)[names(index)==row] <- "row"
    names(index)[names(index)==column] <- "column"
    if(is.null(filename))filename <- paste0(gsub("-","",as.character(Sys.Date())),"excelfile.xlsx")
    wb <- loadWorkbook(filename,create=T)
    createSheet(wb,"conspvalues")
    writeWorksheet(wb,data,"conspvalues")
    style <- createCellStyle(wb,name="style")
    setFillPattern(style,fill=XLC$"FILL.SOLID_FOREGROUND")
    setFillForegroundColor(style,color=10)
    setCellStyle(wb,sheet = "conspvalues",
                         row=index$row + 1,
                         col=index$column,
                         cellstyle = style)
    return(saveWorkbook(wb))
}

## example
cells <- data.frame(x=sample(nrow(mtcars)),y=sample(ncol(mtcars),size=nrow(mtcars),replace = T))
create.excel(mtcars,cells,row="x",column="y")


Friday, August 8, 2014

transform openstreetmap file to shape file

  • I downloaded the openstreetmap of Saxony from geofabrik
  • I saved the file sachsen-latest.osm.pbf
  • use ogrinfo to show the layers

ogrinfo sachsen-latest.osm.pbf
: Had to open data source read-only.
: INFO: Open of `sachsen-latest.osm.pbf'
:       using driver `OSM' successful.
: 1: points (Point)
: 2: lines (Line String)
: 3: multilinestrings (Multi Line String)
: 4: multipolygons (Multi Polygon)
: 5: other_relations (Geometry Collection)

  • then use ogr2ogr to transform a layer to a shapefile
  • additional I used the spat argument for only including the Leipzig region

ogr2ogr -spat 12.2 51.2 12.6 51.45 -f "ESRI Shapefile" sachsen2.osm.pbf sachsen-latest.osm.pbf multipolygons

Author: Mandy Vogel

Created: 2014-08-08 Fr 09:25

Emacs 24.3.1 (Org mode 8.2.5h)

Validate

Wednesday, August 6, 2014

Filter SpatialPolygonsDataFrame

  • by id
> subset(sp.data.frame, id %in% myIDs)
  • by coordinates
> range <- cbind(c(long1,long2), c(lat1,lat2))
> centroids <- coordinates(sp.data.frame)
> spdf.subset <- sp.data.frame[centroids[,1] > range[1,1] &
                    centroids[,1] < range[2,1] &
                    centroids[,2] > range[1,2] &
                    centroids[,2] < range[2,2],]

Sunday, June 15, 2014