Posts Tagged ‘label’

Boxplot with outliers label & sample size

Thursday, February 3rd, 2011

Una herramienta útil para hacer un boxplot e identificar los outliers. La función del boxplot es la siguiente:

boxplot.with.outlier.label <- function(y, label_name, ..., spread_text = T, data, plot = T, range = 1.5, label.col = "blue", 
										jitter_if_duplicate = T, jitter_only_positive_duplicates = F)
{	
	# jitter_if_duplicate - will jitter (Actually just add a bit of numbers) so to be able to decide on which location to plot the label when having identical variables...
	require(plyr) # for is.formula and ddply
 
	# a function to jitter data in case of ties in Y's
	jitter.duplicate <- function(x, only_positive = F)
	{
		if(only_positive) {
			ss <- x > 0
		} else {
			ss <- T
		}	
		ss_dup <- duplicated(x[ss])
		# ss <- ss & ss_dup
		temp_length <- length(x[ss][ss_dup])	
		x[ss][ss_dup] <- x[ss][ss_dup] + seq(from = 0.00001, to = 0.00002, length.out = temp_length)
		x
	}
	# jitter.duplicate(c(1:5))
	# jitter.duplicate(c(1:5,5,2))
	# duplicated(jitter.duplicate(c(1:5,5,2)))
	# jitter.duplicate(c(0,0,1:5,5,2))
	# duplicated(jitter.duplicate(c(0,0,1:5,5,2)))
 
 
 
	# handle cases where 
	if(jitter_if_duplicate) {
		# warning("duplicate jutter of values in y is ON")
		if(!missing(data)) {	#e.g: we DO have data
			# if(exists("y") && is.formula(y)) {		# F && NULL # F & NULL
			y_name <- as.character(substitute(y))	# I could have also used as.list(match.call())
												# credit to Uwe Ligges and Marc Schwartz for the help
												# https://mail.google.com/mail/?shva=1#inbox/12dd7ca2f9bfbc39
			if(length(y_name) > 1) {	# then it is a formula (for example: "~", "y", "x"
				model_frame_y <- model.frame(y, data = data)
				temp_y <- model_frame_y[,1]
				temp_y  <- jitter.duplicate(temp_y, jitter_only_positive_duplicates)	# notice that the default of the function is to work only with positive values...
				# the_txt <- paste(names(model_frame_y)[1], "temp_y", sep = "<<-") # wrong...
				the_txt <- paste("data['",names(model_frame_y)[1],"'] <- temp_y", sep = "")				
				eval(parse(text = the_txt))	# jutter out y var so to be able to handle identical values.
			} else {	# this isn't a formula
				data[,y_name] <- jitter.duplicate(data[,y_name], jitter_only_positive_duplicates)
				y <- data[,y_name]	# this will make it possible for boxplot(y, data) to work later (since it is not supposed to work with data when it's not a formula, but now it does :))
			}		
		} else {	# there is no "data"		 
			if(is.formula(y)) { # if(exists("y") && is.formula(y)) {		# F && NULL # F & NULL
				temp_y <- model.frame(y)[,1]
				temp_y  <- jitter.duplicate(temp_y, jitter_only_positive_duplicates)	# notice that the default of the function is to work only with positive values...
				environment(y) <- new.env()
				assign(names(model.frame(y))[1], temp_y, environment(y))
					# Credit and thanks for doing this goes to Niels Richard Hansen (2 Jan 30, 2011)
					# http://r.789695.n4.nabble.com/environment-question-changing-variables-from-a-formula-through-model-frame-td3246608.html
				# warning("Your original variable (in the global environemnt) was just jittered.")	# maybe I should add a user input before doing this....
				# the_txt <- paste(names(model_frame_y)[1], "temp_y", sep = "<<-")
				# eval(parse(text = the_txt))	# jutter out y var so to be able to handle identical values.
			} else {
				y <- jitter.duplicate(y, jitter_only_positive_duplicates)
			}		
		}
	}
	# the_txt <- paste("print(",names(model_frame_y)[1], ")")
	# eval(parse(text = the_txt))	# jutter out y var so to be able to handle identical values.
	# print(ls())
 
 
	# y should be a formula of the type: y~x, y~a*b
	# or it could be simply y
	if(missing(data)) {
			boxdata <- boxplot(y, plot = plot,range = range ,...)
		} else {
			boxdata <- boxplot(y, plot = plot,data = data, range = range ,...)
		}
	if(length(boxdata$names) == 1 && boxdata$names =="") boxdata$names <- 1	# this is for cases of type: boxplot(y) (when there is no dependent group)
	if(length(boxdata$out) == 0 ) stop("No outliers detected for this boxplot")
 
	if(!missing(data)) attach(data)	# this might lead to problams I should check out for alternatives for using attach here...
 
 
	# creating a data.frame with information from the boxplot output about the outliers (location and group)
	boxdata_group_name <- factor(boxdata$group)
	levels(boxdata_group_name) <- boxdata$names[as.numeric(levels(boxdata_group_name))]	# the subseting is for cases where we have some sub groups with no outliers
	boxdata_outlier_df <- data.frame(group = boxdata_group_name, y = boxdata$out, x = boxdata$group)
 
 
	# Let's extract the x,y variables from the formula:
	if(is.formula(y))
	{
		model_frame_y <- model.frame(y)
		y <- model_frame_y[,1]
		x <- model_frame_y[,-1]
		if(!is.null(dim(x))) {	# then x is a matrix/data.frame of the type x1*x2*..and so on - and we should merge all the variations...
			x <- apply(x,1, paste, collapse = ".")
		}
	} else {
		# if(missing(x)) x <- rep(1, length(y))
		x <- rep(1, length(y))	# we do this in case y comes as a vector and without x
	}	
 
	# and put all the variables (x, y, and outlier label name) into one data.frame
	DATA <- data.frame(label_name, x ,y)
	if(!missing(data)) detach(data)	# we don't need to have "data" attached anymore.
 
	# let's only keep the rows with our outliers 
	boxplot.outlier.data <- function(xx, y_name = "y")
	{
		y <- xx[,y_name]
		boxplot_range <- range(boxplot.stats(y, coef = range )$stats)
		ss <- (y < boxplot_range[1]) | (y > boxplot_range[2])
		return(xx[ss,])	
	}
	outlier_df <-ddply(DATA, .(x), boxplot.outlier.data)
 
 
	# create propor x/y locations to handle over-laping dots...
	if(spread_text) {
		# credit: Greg Snow
		require(TeachingDemos)		
		temp_x <- boxdata_outlier_df[,"x"]
		temp_y1 <- boxdata_outlier_df[,"y"]
		temp_y2 <- temp_y1
		for(i in unique(temp_x))
		{
			tmp <- temp_x == i
			temp_y2[ tmp ] <- spread.labs( temp_y2[ tmp ], 1.3*strheight('A'), maxiter=6000, stepsize = 0.05) #, min=0 )
		}
 
	}
 
 
	# max(strwidth(c("asa", "a"))
	# move_text_right <- max(strwidth(outlier_df[,"label_name"]))	
 
	# plotting the outlier labels :)  (I wish there was a non-loop wise way for doing this)
	for(i in seq_len(dim(boxdata_outlier_df)[1]))
	{
		# ss <- (outlier_df[,"x"]  %in% boxdata_outlier_df[i,]$group) & (outlier_df[,"y"] %in% boxdata_outlier_df[i,]$y)
 
		# if(jitter_if_duplicate) {
			# ss <- (outlier_df[,"x"]  %in% boxdata_outlier_df[i,]$group) & closest.number(outlier_df[,"y"]  boxdata_outlier_df[i,]$y)
		# } else {
		ss <- (outlier_df[,"x"]  %in% boxdata_outlier_df[i,]$group) & (outlier_df[,"y"] %in% boxdata_outlier_df[i,]$y)
		# }
 
		current_label <- outlier_df[ss,"label_name"]
		temp_x <- boxdata_outlier_df[i,"x"]
		temp_y <- boxdata_outlier_df[i,"y"]		
		# cbind(boxdata_outlier_df,		temp_y2)
		# outlier_df
 
 
		if(spread_text) {
			temp_y_new <- temp_y2[i] # not ss			
			move_text_right <- strwidth(current_label)
			text( temp_x+move_text_right, temp_y_new, current_label, col = label.col)			
			# strwidth
			segments( temp_x+(move_text_right/6), temp_y, temp_x+(move_text_right*.47), temp_y_new )
		} else {
			text(temp_x, temp_y, current_label, pos = 4, col = label.col)
		}		
	}
 
	# outputing some of the information we collected
	list(boxdata = boxdata, boxdata_outlier_df = boxdata_outlier_df, outlier_df=outlier_df)
}

Un ejemplo:

boxplot.with.outlier.label (Lip~Egon,Cod, varwidth=T, ylab="Lipidos(ug/mg)", xlab="Estado gonada", data=P1_98gon, ylim=c(50,350))
sample.size <- tapply(P1_98gon$Lip, P1_98gon$Egon, length) ####Codigo  para determinar el tamaño muestral
ss.ch <- paste("N=", sample.size, sep="")            ####Codigo  pegar N= a los datos de tamaño muestral
mtext(ss.ch, at=1:length(unique(P1_98gon$Lip)), line=2, side=3)   ####Codigo  poner una linea de texto en el eje x en cada boxplot


Si queremos que no se superpongan los labels instalar el paquete {teachingDemos} y {plyr}

boxplot.with.outlier.label (Lip~Egon,Cod, varwidth=T, ylab="Lipidos(ug/mg)", xlab="Estado gonada", data=A99gon, ylim=c(50,350), spread_text = F) #añadimos spread_text=F

Fuente: R-Staistics blog