# an illustration of simpson's paradox # Kyle Gorman # first off, a bit of python code I used to create the data # ##!/usr/bin/env python #from random import uniform, gauss #fid = open('sim.csv', 'w') #fid.write('yield,temperature,country,group\n') # #for i in xrange(15): # countries # for j in xrange(100): # x = uniform(7, 10) # fid.write("%f,%f,%d,a\n" % (x, 2 * x + gauss(0, 3) + 5, i)) # #for i in xrange(15, 18): # for j in xrange(100): # x = uniform(4, 8) # fid.write("%f,%f,%d,b\n" % (x, 2 * x + gauss(0, 5) + 20, i)) # #for i in xrange(18, 20): # for j in xrange(100): # x = uniform(2, 4) # fid.write("%f,%f,%d,c\n" % (x, 2 * x + gauss(0, 5) + 25, i)) # #for i in xrange(21, 25): # for j in xrange(100): # x = uniform(0, 2) # fid.write("%f,%f,%d,d\n" % (x, 2 * x + gauss(0, 5) + 30, i)) # # so run this: it'll produce "sim.csv" # libraries library(arm) library(car) library(languageR) dat = read.csv('sim.csv', header=1) png('scatter.png') plot(yield ~ temperature, data=dat, main='temperature and crop yield (simulation)', xlab='temperature (celcius)') dev.off() png('regline.png') plot(yield ~ temperature, data=dat, main='temperature and crop yield (simulation)', xlab='temperature (celcius)') m0 = lm(yield ~ temperature, data=dat) summary(m0) reg.line(m0, lwd=3) dev.off() png('groupline.png') plot(yield ~ temperature, data=subset(dat, dat$group == 'a'), main='temperature and crop yield (simulation)', xlab='temperature (celcius)', col=3, xlim=range(dat$temperature), ylim=range(dat$yield), pch=21) points(yield ~ temperature, data=subset(dat, dat$group == 'b'), col=2, pch=22) points(yield ~ temperature, data=subset(dat, dat$group == 'c'), col=4, pch=23) points(yield ~ temperature, data=subset(dat, dat$group == 'd'), col=6, pch=24) m1 = lmer(yield ~ temperature + (1 | group), data=dat) summary(m1) abline(fixef(m1)[1], fixef(m1)[2], lwd=3, col=1) # main line rans = ranef(m1)$group$`(Intercept)` abline(fixef(m1)[1] + rans[1], fixef(m1)[2], lwd=2, lty=3) # ranline abline(fixef(m1)[1] + rans[2], fixef(m1)[2], lwd=2, lty=3) abline(fixef(m1)[1] + rans[3], fixef(m1)[2], lwd=2, lty=3) abline(fixef(m1)[1] + rans[4], fixef(m1)[2], lwd=2, lty=3) #pvals.fnc(m1) # this make take a while, but is worth looking at dev.off()