朋友友情赞助的stat442的笔记加一点点点tableau(坑了)的笔记.

R

NumbersOrder

#library(aplpack)
# stem(marks, scale = 0.5, width = 100)
#      the defalut in separated each group from 1-4 and 5-9. see page 7.
# stem.leaf(marks)

# the value of ordered values
wordLeaders <- c(62,65,65,65,66,67,68,72,73,73)
n <- length(wordLeaders)
p <- ppoints(n)
plot(x = p, y = wordLeaders, type = "o", lwd = 2,
     col = "grey10", xlab = "proportions", ylab = "height in inches",
     main = "heights of world leaders")
# n <- length(marks)
# p <- ppoints(n)
# plot(x = p, y = sort(marks), type = "o", lwd = 2, col = "grey10",
#     xlab = "proportion p", xlim = c(0,1), ylim = c(0, 100),
#     main = "empirical qunatile function for the final grades")

# quantile plot for marks data
data("sittingHeights", package = "qqtest")
with(sittingHeights,
     plot(percentCumulative, upperBound, type = "o", lwd = 2,
          xlim = c(0,100), ylim = c(27, 38),
          xlab = "percentage of women",
          ylab = "sitting heights in inches"))



# air quality measurements
qvals <- sort(airquality$Solar.R)
n <- length(qvals)
pvals <- ppoints(n)

plot(pvals, qvals,
     main = "solar radiation in central park",
     xlab = "cumulative proportion",
     ylab = "radiation (Langleys)")
# comments : very dense at the bottom, but not on the top

# old faithful geyser
data("geyser", package = "MASS")
qvals <- sort(geyser$duration)
n <- length(qvals)
pvals <- ppoints(n)
plot(pvals, qvals, cex = 1.5,
     main = "quantile plot of old faithful eruptions",
     xlab = "cumulative proportion",
     ylab = "duration (minutes)")

Histograms

drawbox <- function(x,y,...){
  xvals <- c(x, reve(x), x[1])
  yvals <- c(rep(y, each = 2), y[1])
  lines(xvals, yvals, ...)
}

#  the emprical cumulative distirbution
plot(x = sort(marks), y = ppoints(length(marks)), type ="o",
     col = "grey10", xlab = "final grade",
     ylab = "proportion", main = "empirical distribution function")

# in r, the function ecdf() produces the empirical distribution function from
# any collection of numbers
Fhat <- ecdf(marks)
plot(Fhat,type ="o",
     col = "grey10", xlab = "final grade",
     ylab = "proportion", main = "empirical distribution function")


# histogram
data("geyser", package = "MASS")
plot_colour <- "grey50"
with(geyser,
     hist(duration, col = plot_colour, xlim = extendrange(duration),
          main = "old faithful",xlab ="duration"))

# change the brak points     
with(geyser,
     hist(duration, col = plot_colour, xlim = extendrange(duration),
          breaks = c(0,0.5,1.0,2.75,3,3.5,4,4.5,5,6.0),
          main = "old faithful",xlab ="duration"))     

# change the starting point in histogram
savepar <- par(mfrow = c(2,3))
with(geyser,
     for (start in seq(0,0.4,0.1))
       {
       hist(duration,
            breaks = seq(start, 6.0,0.5),
            probability = TRUE,
            xlim = c(0, 6),
            ylim = c(0, 0.7),
            main = "",
            xlab = "duration")
     })

par(savepar)


# interactive histograms with loon package
library(loon)
data("geyser", package = "MASS")
with(geyser,
     {l_hist(duration,
             yshows = "density",
             showBinHandle = TRUE,
             xlab = "duration of eruption (minutes)",
             title = "old faithful geyser",
             showScales = TRUE,
             showGuides = TRUE,
             linkingGroup = "old faithful"
             )})

# show uncertainty due to bin locations
# blurness suggests uncertainty
data("geyser", package = "MASS")
Nstarts <- 5
start_values <- seq(0,0.4, length.out = Nstarts)
plot_colour_transparent <- adjustcolor(plot_colour, alpha.f = 1/Nstarts)
with(geyser,
     for (start in start_values)
       {if (start == 0)
         {hist(duration,
               breaks = seq(start, 6.0, 0.5),
               col = plot_colour_transparent,
               border = adjustcolor(plot_colour_transparent, 0.0),
               probability = TRUE,
               xlim = c(0,7), ylim = c(0,0.7),
               main = "",xlab = "duration")

       } else { #add the histogram to the existing plot
         hist(duration,
              breaks = seq(start, 6.0, 0.5),
              col = plot_colour_transparent,
              border = adjustcolor(plot_colour_transparent, 0.0),
              probability = TRUE,
              add = TRUE)
       }})

# averaging over histograms as the bins change
hist <- hist(geyser$duration,breaks=seq(0.0,6.0,0.5),
             plot = FALSE)
# str(hist) #gives the summary of the structure

# the function averages the shifted histograms
ash <- function(x, n_hists = 5, binwidth = NULL,
                type = "l", xlab = "x", main = "average shifted histogram",
                ylab = "averaged", lwd = 2, lty = 1, xlim, ylim,
                col = "grey50", fill, ...){
  xrange <- round(extendrange(x), 1)
  if (is.null(binwidth)) binwidth <- diff(xrange) / 10
  delta <- binwidth/n_hists
  xmin <- min(xrange) - binwidth
  xmax <- max(xrange) + binwidth
  breaks <- seq(xmin, xmax -binwidth, binwidth)
  n_breaks <- length(breaks)

  ## x values where density will be calculated and averaged
  xvals <- seq(xmin, xmax-delta, delta)
  n_xvals <- length(xvals)

  # total density will be numeric, this is where we store a total density for
  #    each xval
  density <- numeric(n_xvals)

  # the loop over the histograms to get densities, add them up
  # the indices of x have n_hists fewer values becasue there are
  # n_hists shifts in the summing
  x_indices <- seq(1,n_xvals-n_hists)
  for ( i in  seq(0, n_hists - 1)) {
    cur_hist <- hist(x, breaks=breaks+delta*i, plot = FALSE)
    density[i+x_indices] <-
      # current values
      density[i + x_indices] +
       ## add the density from the current histogram
      ## rep: repeatsnthe data returned by its first arguement
  rep(cur_hist$density, rep(n_hists, n_breaks - 1))
  }
  density <- density/n_hists
  # plot the averages, and check for missing arguments
  if (missing(xlim)) xlim <- xrange
  if (missing(ylim)) ylim <- extendrange(density)
  if (missing(fill)) fill <- col
  plot(rep(xvals, each = 2) + rep(c(-delta/2, delta/2), length(xvals)),
       rep(density, each = 2), type = type,
       xlim = xlim, ylim = ylim, xlab = xlab, ylab = ylab, main = main,
       lwd = lwd, lty = lty, col = col,
       bty = "n", ...)
  # and fill it in
  polygon(rep(xvals, each = 2) + rep(c(-delta/2, delta/2), length(xvals)),
          rep(density, each = 2), col =fill)
}


savePar <- par(mfrow = c(2,3))
with(geyser,
     {for(start in seq(0,0.4,0.1))
     {hist(duration,
           breaks=seq(start,6.0,0.5),
           col = plot_colour,
           probability = TRUE,
           xlim = c(0,6),
           ylim = c(0, 0.7),
           main = "",
           xlab = "duration"
           )  
     }
       ash(duration, main="",xlab = "duration", xlim = c(0,6),
           ylim=c(0,0.7))
       }
     )  
par(savePar)


# weight function, this returns the weights alone or the weights and information
# on how to draw them as a function of x
ashweights <- function(m,h,k=0){
  # k is the subscript of the central bin containing x, B_k,
  # m is the number of histograms
  # h is the bin width in each of the m histograms being averaged
  indices <- seq(1-m, m-1, 1)
  wts <- 1-abs(indices)/m
  if (!missing(h)){
    width <- h/m
    hts <- rep(c(0, wts, 0), each = 2)
    bins <- c(-m, rep(c(indices, m), each = 2), m+1) * width + k
    list(wts = wts, bins = bins, hts = hts)
  } else {
    list(wts = wts)
  }
}

# when m = 4, the weights are
ashweights(m = 4)$wts

# draw the weighted function
h <- 0.5
m <- 4
ashInfo <- ashweights(m = m, h = h, k = 0)
w_hts <- ashInfo$hts
B_bins <- ashInfo$bins

plot(B_bins, w_hts, type = "l",
     main = expression("ASH weight function for" ~ x %in% B[0]),
     ylab = "weights", xlab = "x",
     col = "firebrick")
lines(c(0,0), c(0, max(w_hts)), lty = 2)
lines(rep(h/m,2), c(0, max(w_hts)), lty = 2)
text(0.5*(h/m), 0.1, labels = expression(B[0]))

# the weight is highest for the count of x in B_0, and decreases symmetrically
#  for m-1 small bins on either side

visual comparisons


library(knitr)
classTable <- apply(Titanic,MARGIN = c(4,1), FUN = sum)
kable(classTable)

# number in each class is
classTotals <- apply(classTable, MARGIN = 2, FUN =sum)
classSurvival <- t(classTable["Yes", ]/classTotals)
rownames(classSurvival) <- c("survived")
kable(classSurvival)


# follw the rules for tables, a better way to present these numbers is
newTable <- 100 * round(classSurvival, 2)
newTable <- t(newTable)
# descendeing order
descendingOrder <- order(newTable, decreasing = TRUE)
newTable <- newTable[descendingOrder, ,drop = FALSE] # note drop argument
colnames(newTable) <- c("% survived")
kable(newTable, caption = "survival rates on the titanic by class")

# as lengths of bars, colour coded and labelled by class
nvals <- nrow(newTable)
col <- rainbow(nvals, alpha = 0.5)
barplot(newTable, col = col, horiz = TRUE, axes = FALSE,
        names.arg = c(""), xlab = colnames(newTable))
xlocs <- cumsum(newTable)
centres <- c(xlocs[1]/2, xlocs[1:(nvals - 1)] + diff(xlocs)/2)
text(centres, 0.75, labels = rownames(newTable))

barplot(newTable, col = col, horiz = TRUE, beside = TRUE,
        names.arg = c(""),
        xlab = colnames(newTable),
        legend.text = rownames(newTable))


# the titanic - survivors beside deaths
survivalProportions <- classTable
survivalProportions["Yes",] <- survivalProportions["Yes", ]/classTotals
survivalProportions["No",] <- survivalProportions["No", ]/classTotals
survivalCols <- adjustcolor(c("black", "grey"), 0.5)
barplot(survivalProportions, col = survivalCols,
        horiz = TRUE, beside = TRUE,
        xlab = "proportion of class", xlim = c(0,1))
legend("bottomright", title = "survival", fill = survivalCols,
       legend = rownames(survivalProportions))

# survivor and deaths within a common frame
# there ia a framing effect which makes proportions very easy to compare
# from one end or the other
# all are along common aligned scales

barplot(survivalProportions, col = survivalCols,
        horiz = TRUE, beside = FALSE,
        xlab = "proportion of class", space = 0)

barplot(apply(classTable, MARGIN = 2, FUN =sum),
        col = adjustcolor("steelblue", 0.5),
        xlab = "class", ylab = "number of passengers")
barplot(classTable, col=survivalCols, xlab = "class", ylab = "numebr of passengers")




library(loon.data)
data("SAheart", package = "loon.data")
kable(head(SAheart))

noFamilyHistory <- SAheart[,"famhist"] == "Absent"
FamilyHistory <- SAheart[,"famhist"] == "Present"
famHIsotryCol <- adjustcolor("steelblue", 0.5)
noHistoryCol <- adjustcolor("firebrick", 0.5)
boxplot(SAheart[noFamilyHistory, "sbp"], col = famHIsotryCol,
        main = "No family history", horizontal = TRUE)


# use formula notation y~x and a single data set, on common aligned scales
boxplot(sbp~famhist, data = SAheart, col = c(noHistoryCol, famHIsotryCol),
        main = "systolic blood pressure", horizontal = TRUE)

library(ggplot2)
ggplot(data=SAheart, mapping = aes(x=famhist, y = sbp)) +
  geom_boxplot(colour = c("firebrick", "steelblue"),
               fill = c("firebrick", "steelblue"),
               alpha = 0.5) +
  coord_flip()


hist(SAheart[noFamilyHistory, "sbp"], col = noHistoryCol,
     main = "No family history",
     xlab = "systolic blood pressure",
     xlim = extendrange(SAheart[,"sbp"]), ylim = c(0, 60))
# when use par(mfrow = c(2,1)) , means not on common scale
# put on common scale as follows
hist(SAheart[noFamilyHistory, "sbp"], col = noHistoryCol,
     main = "No family history",
     xlim = extendrange(SAheart[,"sbp"]))
hist(SAheart[FamilyHistory, "sbp"], col = famHIsotryCol,
     xlim = extendrange(SAheart[,"sbp"]), add = TRUE)



# reflected histograms
xrange <- extendrange(SAheart[,"sbp"])
breaks <- seq(xrange[1], xrange[2], length.out = 12)
h1 <- hist(SAheart[noFamilyHistory,"sbp"],
           breaks = breaks, plot = FALSE)
h2 <- hist(SAheart[FamilyHistory,"sbp"],
           breaks = breaks, plot = FALSE)
hmax = max(c(h1$counts, h2$counts))
h2$counts <- -h2$counts
hmin = -hmax
X = c(h1$breaks, h2$breaks)
xmax <- max(X)
xmin = min(X)
plot(h1, xlab = "systolic blood pressure", main = "comapring patients with(blue) and without(pink)",
     ylim = c(hmin, hmax),xlim = c(xmin, xmax),
     col = noHistoryCol)
lines(h2, col = famHIsotryCol)

# back to back histograms'
yrange <- extendrange(SAheart[,"sbp"])
breaks <- seq(yrange[1], yrange[2], length.out = 12)
h1 <- hist(SAheart[noFamilyHistory,"sbp"],
           breaks = breaks, plot = FALSE)
h2 <- hist(SAheart[FamilyHistory,"sbp"],
           breaks = breaks, plot = FALSE)
nbreaks <- length(breaks)
hmax = max(c(h1$counts, h2$counts))
h2$counts <- -h2$counts
hmin = -hmax
Y <- rep(h1$breaks, each = 2)
X <- c(0,rep(h1$counts, each = 2), 0)
plot(rep(0,2), range(Y), type = "l", col = "black",
     xlim = c(hmin, hmax), ylim = extendrange(Y),
     bty = "n",
     xlab = "frequency", ylab = "systolic blood pressure",
     main = "comparing patients with and without")
polygon(X, Y, col = noHistoryCol)
for (i in 1:nbreaks){lines(c(0, h1$counts[i]), c(rep(h1$breaks[i+1], 2)))}
Y <- rep(h2$breaks, each = 2)
X <- c(0,rep(h2$counts, each = 2), 0)
polygon(X, Y, col = famHIsotryCol)
for(i in 1:nbreaks){lines(c(0,h2$counts[i]), c(rep(h2$breaks[i+1],2)))}



# using facet with ggplot2
library(ggplot2)
ggplot(data =SAheart, mapping = aes(x =sbp)) +
  geom_histogram(bins =12,
                 colour = "grey50",
                 fill="white") +
  facet_grid(famhist~.)


# what can be compared via density estimates
savepAr <- par(mfrow=c(1,2))
densAbsent <- density(SAheart[noFamilyHistory,"sbp"], bw = "SJ")
densPresent <- density(SAheart[FamilyHistory,"sbp"], bw = "SJ")

plot(densAbsent, col = "firebrick", main = "no family history")
polygon(densAbsent, col = noHistoryCol)
plot(densPresent, col = "steelblue", main = "family history")
polygon(densPresent, col = famHIsotryCol)
par(savepAr)
# note the scales are not necesssarily identical

# compare both x,y aligned x only scales
savep <- par(mfrow=c(2,1))
xlim <- extendrange(SAheart[,"sbp"])
ylim <- extendrange(c(densAbsent$y, densPresent$y))
plot(densAbsent, col = "firebrick", main = "no family history",
     xlim = xlim, ylim = ylim)
polygon(densAbsent, col = noHistoryCol)
plot(densPresent, col = "steelblue",main = "family history",
     xlim = xlim, ylim = ylim)
polygon(densPresent, col = famHIsotryCol)
par(savep)


# common aligned scales
plot(densAbsent, col = "firebrick", xlab = "systolic blood pressure",
     main = "comparing a no family history with family history",
     xlim = xlim, ylim = ylim)
polygon(densAbsent, col = noHistoryCol)
lines(densPresent, col = "steelblue")
polygon(densPresent, col = famHIsotryCol)





# common aligned scales-reflected
densPresent$y <- - densPresent$y
ylim2 <- extendrange(c(densAbsent$y, densPresent$y))
plot(densAbsent, col = "firebrick", xlab = "systolic blood pressure",
     main = "comparing a no family history with family history",
     xlim = xlim, ylim = ylim2)
polygon(densAbsent, col = noHistoryCol)
lines(densPresent, col = "steelblue")
polygon(densPresent, col = famHIsotryCol)



# back to back
ylim3 <- extendrange(SAheart[,"sbp"])
xlim2 <- extendrange(c(densAbsent$y, densPresent$y))
xyswitch <- function(xy_thing){
  yx_thing <- xy_thing
  yx_thing$x <- xy_thing$y
  yx_thing$y <- xy_thing$x
  yx_thing
}
plot(xyswitch(densAbsent), col = "firebrick", xlab = "Density",
     ylab = "systolic blood pressure",
     main = "comparing a no family history with family history",
     xlim = xlim2, ylim = ylim3)
polygon(xyswitch(densAbsent), col = noHistoryCol)
lines(xyswitch(densPresent), col = "steelblue")
polygon(xyswitch(densPresent), col = famHIsotryCol)


# using facet with ggplot2
libarary(ggplot2)
ggplot(data=SAheart, mapping = aes(x=sbp, col = famhist)) +
  geom_density(colour = "grey50",
               fill = "black",
               alpha = 0.4,
               bw = "SJ") +
  facet_grid(famhist~.)



# simple points
xlim <- extendrange(SAheart[,"sbp"])
n <- nrow(SAheart)
col = rep(adjustcolor("firebrick", 0.2),n)
col[FamilyHistory] <- adjustcolor("steelblue", 0.2)
Y <- rep(1,n)
Y[FamilyHistory] <- -1
plot(SAheart[,"sbp"], y = Y, col = col, pch = 19, cex = 3,
     xlab = "systolic blood pressure",
     main = "comparing a no family history with family history",
     xlim = xlim, ylim = c(-2,2), bty = "n", yaxt = "n")

# simple points with jittering

xlim <- extendrange(SAheart[,"sbp"])
n <- nrow(SAheart)
col = rep(adjustcolor("firebrick", 0.2),n)
col[FamilyHistory] <- adjustcolor("steelblue", 0.2)
Y <- rep(1,n)
Y[FamilyHistory] <- -1
U <- runif(n, -0.3,0.3)
plot(SAheart[,"sbp"], y = Y + U, col = col, pch = 19, cex = 3,
     xlab = "systolic blood pressure",
     main = "comparing a no family history with family history",
     xlim = xlim, ylim = c(-2,2), bty = "n", yaxt = "n")

# quantile plots
savePar <- par(mfrow = c(1,2))
nAbsent <- sum(noFamilyHistory)
nPresent <- sum(FamilyHistory)
pAbsent <- ppoints(nAbsent)
pPresent <- ppoints(nPresent)

plot(pAbsent,sort(SAheart[noFamilyHistory, "sbp"]), type = "b",
     col = noHistoryCol, pch =19,
     xlab = "cumulative proportion",
     ylab = "systolic blood pressure",
     main = "no family histroy")

plot(pPresent,sort(SAheart[FamilyHistory, "sbp"]), type = "b",
     col = famHIsotryCol, pch =19,
     xlab = "cumulative proportion",
     ylab = "systolic blood pressure",
     main = "family histroy")
par(savePar)

# quantile plot common aligned scales via overlaying
ylim <- extendrange(SAheart[,"sbp"])
nAbsent <- sum(noFamilyHistory)
nPresent <- sum(FamilyHistory)
pAbsent <- ppoints(nAbsent)
pPresent <- ppoints(nPresent)

plot(pAbsent,sort(SAheart[noFamilyHistory, "sbp"]), type = "b",
     col = noHistoryCol, pch =19,
     xlab = "cumulative proportion",
     ylab = "systolic blood pressure",
     main = "no family histroy")
points(pPresent, sort(SAheart[FamilyHistory, "sbp"]),
       type = "b", col = famHIsotryCol, pch =19)

visual hypothesis testing


# a function which would achieve this mixing is
mixRandomly <- function(data){
  # note the data need not to be a data frame
  # it is expected to be a list with an x and y component (possibly different lengths)
  x <- data$x
  y <- data$y
  m <- length(x)
  n <- length(y)
  mix <- c(x,y)
  select4x <- sample(1:(m+n),
                     m,
                     replace = FALSE)
  new_x <- mix[select4x]
  new_y <- mix[-select4x]
  list(x = new_x, y = new_y)
}


myplot <- function(data) {
  xrange <- extendrange(c(data$x, data$y))
  breaks <- seq(xrange[1], xrange[2], length.out = 12)
  hx <- hist(data$x, breaks = breaks, plot = FALSE)
  hy = hist(data$y, breaks = breaks, plot = FALSE)
  yrange <- extendrange(c(hx$counts, hy$counts))

  plot(hx, main = "", axes = FALSE,
       ylab = "", xlab = "", xaxt = "n", yaxt = "n",
       ylim = yrange, xlim = xrange,
       col = adjustcolor("firebrick", 0.4))
  lines(hy, col = adjustcolor("steelblue", 0.4))
}

# apply it
library(loon.data)
data("SAheart", package = "loon.data")
Present <- SAheart[, "famhist"] == "Present"
x <- SAheart[Present, "sbp"]
y <- SAheart[!Present, "sbp"]
data = list(x= x, y=y)

savePar <- par(mfrow = c(1,2))
heads <- runif(1) > 0.5 # toss a coin for order
tails = !heads
if (heads) myplot(data) else myplot(mixRandomly(data))
if (tails) myplot(mixRandomly(data)) else myplot(data)

par(savePar)


# function to do a line up test

lineup <- function(data, showSubject = NULL, generateSubject = NUll,
                   trueLoc = NULL, layout = c(5,4)){
  nSubjects <- layout[1] *layout[2]
  if (is.null(trueLoc)) {trueLoc <- sample(1:nSubjects, 1)}
  if (is.null(showSubject)) {stop("need a plot function for the subject")}
  if (is.null(generateSubject)) { stop("need a function to generate subject")}

  # need to decide which subject to present
  presentSubject <- function(subjectNo){
    if (subjectNo != trueLoc) {data <- generateSubject(data)}
    showSubject(data, subjectNo)}

    # this does the plotting
    savePar <- par(mfrow= layout,
                   mar = c(2.5,0.1,0.3,0.1), oma = rep(0,4))
    sapply(1:nSubjects, FUN = presentSubject)
    par(savePar)

    # hide the true location but return information to reconstruct it
    hideLocation(trueLoc, nSubjects)
}

hideLocation <- function(trueLoc, nSubjects){
  possibleBaseVals <- 3:min(2*nSubjects, 50)
  # remove easy base values
  possibleBaseVals <- possibleBaseVals[possibleBaseVals != 10 &
                                         possibleBaseVals != 5]
  base <- sample(possibleBaseVals, 1)
  offset <- sample(5:min(5*nSubjects, 125), 1)

  # return the location information (trueLoc hidden)
  list(trueLoc <- paste0("log(",
                         base^(trueLoc +offset),
                         ", base = ", base,
                         ") - ", offset))
}

revealLocation <- function(hideLocation){eval(parse(text=hideLocation$trueLoc))}

# suspectLocation <- hideLocation(3,20)
# revealLocation(suspectLocation)


# have to fix myPlot to acccept a subject number
myPlot <- function(data, subjectNo) {
  xrange <- extendrange(c(data$x, data$y))
  breaks <- seq(xrange[1], xrange[2], length.out = 12)
  hx <- hist(data$x, breaks = breaks, plot = FALSE)
  hy <- hist(data$y, breaks = breaks, plot = FALSE)
  yrange <- extendrange(c(hx$counts, hy$counts))

  plot(hx, main = paste(   subjectNo), cex.main = 1.5, line = -0.75,
       ylab = "", xlab = "", xaxt = "n", yaxt = "n",
       ylim = yrange, xlim = xrange,
       col = adjustcolor("firebrick", 0.4))
  lines(hy, col = adjustcolor("steelblue", 0.4))
}

# we need only execute the test as follows
data <- list(x = x, y=y)
lineup(data,
       generateSubject = mixRandomly,
       showSubject = myPlot,
       layout = c(4,5))



# back to back displays
back2back <- function(data, subjectNo) {
  ylim <- extendrange(c(data$x, data$y))
  Xdensity <- density(data$x, bw = "SJ")
  Ydensity <- density(data$y, bw = "SJ")
  Ydensity$y <- - Ydensity$y
  xlim <- extendrange(c(Xdensity$y, Ydensity$y))
  xyswitch <- function(xy_thing){
    yx_thing <- xy_thing
    yx_thing$x <- xy_thing$y
    yx_thing$y <- xy_thing$x
    yx_thing
  }
  plot(xyswitch(Xdensity), col = "firebrick", xlab = "",
       ylab = "",
       main = paste(subjectNo),cex.main = 1.5, adj = 0.75,line = -0.85,
       xaxt = "n", yaxt = "n",
       xlim = xlim, ylim = ylim)
  polygon(xyswitch(Xdensity), col = adjustcolor("firebrick", 0.4))
  lines(xyswitch(Ydensity), col = "steelblue")
  polygon(xyswitch(Ydensity), col = adjustcolor("steelblue", 0.4))

}

data <- list(x = x, y=y)
lineup(data,
       generateSubject = mixRandomly,
       showSubject = back2back,
       layout = c(4,5))

# quantiles display
quantiles <- function(data, subjectNo){
  ylim <- extendrange(c(data$x, data$y))
  n_x <- length(data$x)
  n_y <- length(data$y)
  p_x <- ppoints(n_x)
  p_y <- ppoints(n_y)

  plot(p_x, sort(data$x), type = "b",
       pch = 19, col = adjustcolor("firebrick", 0.4),
       cex = 2, ylim = ylim,
      xlab = "", ylab = "", main = paste(subjectNo),
      cex.main = 1.5, adj = 0.75,line = -0.85,
      xaxt = "n", yaxt = "n")
  points(p_y, sort(data$y), type = "b",
         col = adjustcolor("steelblue", 0.4),pch = 19,
         cex = 2)
}

lineup(data,
       generateSubject = mixRandomly,
       showSubject = quantiles,
       layout = c(4,5))

# we might use boxplot if we are only interested in comparing locations and
#   possible scales and symmetry

boxplots <- function(data, subjectNo){
  y <- c(data$x, data$y)
  x <- c(rep(0, length(data$x)),
         rep(1, length(data$y)))
  boxplot(y~x, data = cbind(x,y),
          col = adjustcolor(c("firebrick", "steelblue"), 0.4),
          main = paste(subjectNo),
          cex.main = 1.5, adj = 0.75, line = -0.75,
          xlab = "", ylab = "", xaxt = "n", yaxt = "n")
}

hypothetical generative

# this function will return x and y data that have the same median(0) and
# interquartile range(1): same slope in the middle

matchData <- function(data){
  x <- data$x
  y <- data$y
  list (x = scale(x, center = median(x), scale=IQR(x)),
        y = scale(y, center = median(y), scale=IQR(y)))
}

lineup(data,
       showSubject = back2back,
       layout = c(4,5),
       generateSubject =
         function(data) {
           data <- matchData(data)
           data <- mixRandomly(data)
           matchData(data)
})




# compare to a specific shape
# rdist(): will generate n pseudo-random values from the hypothesized generative
#  distribution (for some location and scale), then data matched on location (median)
#  and scale(IQR) can be geenrated using:
getMatchData <- function(x,
                         rdist = rnorm,
                         matchScale = TRUE,
                         matchLocation = TRUE){
  new_x <- rdist(length(x))   # geenrate new data Hershey
  # now match the new data to have the same location and scale as the old data
  if (matchScale) {new_x <- new_x * (IQR(x)/IQR(new_x))}
  if (matchLocation) {new_x <- new_x - median(new_x) + median(x)}
  new_x
}

# first, get the actual data
x <- beaver1[,"temp"]

# construct a pairing of it with a randomly geenrated sample from any Gaussian distribution
# and match the location and scale
data <- list(x = x, y = getMatchData(x))

# then this data can now be used in a line up test,
lineup(data,
       showSubject = back2back,
       layout = c(4,5),
       generateSubject =
         function(data) {
           list(x = getMatchData(data$x),
                y = getMatchData(data$y))
         }
       )

# if from other distribution, like t3
rdist <- function(n) rt(n, df = 3)
data <- list(x = x, y = getMatchData(x, rdist = rdist))

## the above method was compared to several other pairs of x and y where both were
# generated from the hyppthesized dsitribuiotn
# the extra variation introduced by random generation of both x and y under the null hypothesis might
# be making the detemination more difficult
# the variation might be reduced if we instead use a single representative set of values
# for the comparison in each graphic.


# sample quantiles vs. theoretical quantiles

library("MASS")
attach(geyser)
plot_colour <- adjustcolor("black", 0.3)
qqnorm(duration, pch = 19, col = plot_colour)
# here geyser duration data with alpha blending to help reveal point density

savePar = par(mfrow=c(1,2))
qqnorm(SAheart[noFamilyHistory, "sbp"], pch = 19,
       col = adjustcolor("firebrick", 0.4),
       main = "no family history")
qqnorm(SAheart[FamilyHistory, "sbp"], pch = 19,
       col = adjustcolor("steelblue", 0.4),
       main = "family history")
par(savePar)



# self calibrating qqplots
library(qqtest)
qqtest(duration)
qqtest(duration, xAxisAsProbs = TRUE,yAxisAsProbs = TRUE)


# ordering comparisons
library(loon.data)
data("SAheart")

# getting the comparisons of interest
# get the groups
groups <- with(SAheart, split(sbp, list(famhist, chd)))
kable(t(names(groups)))

# now ordering the groups
ord <- c("Present.No", "Present.Yes", "Absent.Yes",
         "Absent.No", "Present.No")
# match the colours to the famliy history
cols <- adjustcolor(c("steelblue", "steelblue", "firebrick",
                      "firebrick", "steelblue"), 0.5)
boxplot(groups[ord], col = cols)


# layout via graph structure
library("PairViz")
library(loon.data)
data(cancer)
str(cancer)

# we can separate the survival times by which organ is affected
organs <- with(cancer, split(Survival, Organ))
# and record their names for use later
organNames <- names(organs)
# the structure of the organs data
str(organs)

# getting Eulerians and Hamiltonians
ord <- eulerian(5)

library(colorspace)
# getting a rainbow of colours but choose chromaticity of 50 to dull them
cols <- rainbow_hcl(5, c = 50)
boxplot(organs[ord], col = cols[ord],
        ylab = "Survival time", main = "cancer treated by vitamin C")

# logout via graph structure
# taking square roots should make the data look a liitle more symmetric
sqrtOrgans <- with(cancer, split(sqrt(Survival), Organ))

boxplot(sqrtOrgans[ord], col = cols[ord],
        ylab = expression(sqrt("Survival time")),
        main = "cancer treated by vitamin C")

# hamiltonian decompositions
ordHam <- hpaths(5) # here each hamiltonian is a row of the matrix'
# for our purpose, we would like the hamiltonians to be joined togehter
ordHam <- hpaths(5, matrix = FALSE)



# run a statistical test comapring each pair and use the p-value
# to order. e.g. test the equality of each pair of means(assuming normlaity)
# pairwise.t.test() does this and corrects the p-values for simultaneity
test <- with(cancer,
             pairwise.t.test(sqrt(Survival), Organ))
pvals <- test$p.value
round(pvals, 4)

# the pairviz package can handle graphs with weights
ngrps <- length(organNames)
# build matrix of weights
weights <- matrix(0, nrow = ngrps, ncol= ngrps)
rownames(weights) <- organNames
colnames(weights) <- organNames
weights[2:ngrps, 1:(ngrps-1)] <- pvals
round(weights, 4)
# right size matrix but need to be symmetric by filling the upper triangle to
# match lower
diag(weights) <- 0
for (i in 1:(ngrps - 1)) {
  for (j in (i+1):ngrps) {
    weights[i,j] <- weights[j,i]
  }
}
round(weights, 4)

# greedy eulerians
# low to high
low2highEulord <- eulerian(weights);
colnames(weights)[low2highEulord]
# high to low
high2lowEulord <- eulerian(- weights)
colnames(weights)[high2lowEulord]

# weighted hamiltonian decompositions
low2higihHamord <- weighted_hpaths(weights)
# high to low
high2lowHamord <- weighted_hpaths(- weights)


# having weigthed graph g
library(Rgraphviz)
g <- mk_complete_graph(weights)
# par(mar = c(bottom, left, top, right))
par(mar = c(1, 1, 1, 1))
plot(g, "circo")



#######################################################
########### the code used to install packages in bioManager
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("Rgraphviz")
#######################################################

# remove all edges in the top hald the edges weights
wvals <- weights[upper.tri(weights)]
# the median
q <- quantile(wvals, 0.5)
# trim the graph via the pairviz function dn_graph()
g1 <- dn_graph(g,q,`<=`)
par(mar = c(1, 1, 1, 1))
plot(g1, "circo")

# this subgraph g1 is not even (some nodes have an odd numebr of edges)
# so it must be made even by adding extra edges to get an Eulerian
g1EulOrd <- eulerian(g1)

# back to the SAheart data
library(PairViz);library(loon.data);data("SAheart")
# building the graph
nodes <- names(groups)
SAheartGraph <- new("graphNEL", nodes = nodes, edgemode = "undirected")
SAheartGraph <- addEdge("Present.Yes", "Absent.No", SAheartGraph)
SAheartGraph <- addEdge("Present.Yes", "Absent.Yes", SAheartGraph)
SAheartGraph <- addEdge("Absent.Yes", "Absent.No", SAheartGraph)
SAheartGraph <- addEdge("Absent.No", "Present.No", SAheartGraph)

ord <-eulerian(SAheartGraph)
cols <- adjustcolor(rep(c("firebrick", "steelblue"), 2), 0.5)
names(cols) <- nodes
savePar <- par(mfrow=c(1,2), oma = rep(6,4))
boxplot(groups[ord], col= cols[ord], main = "family hisotry and chd")
plot(SAheartGraph, "circo")
par(savePar)

time order

# time series data
library(loon)
p <- l_plot(sunspot.month, color = "grey", size = 1,
            showScales = TRUE)
# connect the dots
l_layer_line(p, x = time(sunspot.month), y = sunspot.month,
             color = "grey", linewidth = 2,
             index = "end")

l_layer_hide(p, "model")

# show a window of a couple of years
window(sunspot.month, start=c(1960, 1), end = c(1961, 12))
end(sunspot.month)
start(sunspot.month)


# cycle: group the data by season or quarterly phase by using the cycle() function
window(cycle(sunspot.month), start = c(1960, 1), end = c(1960, 12))
quarter <- 1 + ((cycle(sunspot.month) - 1) %/% 3)
monthplot(sunspot.month, col = "grey50", col.base = "red",
          lwd.base = 3, xlab = "quarter", phase = quarter,
          labels = c("first", "second", "third", "fourth"))
# we might explore how well the value of one month might be predicted by the immediately
# prior month, that is how well might any value be predicted from knowing
# only its immediate predecessor
plot(x = sunspot.month[-length(sunspot.month)], y = sunspot.month[-1],
     col = adjustcolor("firebrick", 0.15), pch = 19,
     main = "sunspot activity lagged by one month")

# lag plots: can lag the data by any number of months,
lag.plot(sunspot.month,
         col = adjustcolor("firebrick", 0.15), pch = 19,
         main = "sunspot activity lagged by up to 12 months",
         lags = 12)

freq <- frequency(sunspot.month)
lag.plot(sunspot.month,
         col = adjustcolor("firebrick", 0.15), pch = 19,
         main = "sunspot activity lagged by up to 15 months",
         set.lags = freq*(1:15))

# jumped by 4 months
lag.plot(sqrt(sunspot.month),
         col = adjustcolor("firebrick", 0.15), pch = 19,
         main = "sunspot activity^(1/2) lagged 84 tp 180 by 4",
         set.lags = seq(84,180, 4))

# identify every 11 years of the series as a cycle.
# determine the starting year of each 11 year cycle
timeseries <- sqrt(sunspot.month)
startYears <- seq(start(timeseries)[1], end(timeseries)[1], 11)
# number of cycles
ncycles <- length(startYears)
cycleNums <- rep(1:ncycles, each = 132)
# only need as many as we have data points
cycleNUms <- cycleNUms[1:length(timeseries)]


# each month will have a position number within its cycle
cyclePositions <- rep(1:132, ncycles)
cyclePositions <- cyclePositions[1:length(timeseries)]
head(cyclePositions)

library(RColorBrewer)
cols <- adjustcolor(rep(brewer.pal(9, name = "Set1"),
                       length.out = ncycles), alpha.f = 0.5)
opt <- par(bg="grey95")
plot(x = time(timeseries), y =as.vector(timeseries),
     type = "b", col=cols[cycleNUms], lwd = 2, pch = 19,
     xlab = "Time", ylab = "sqrt monthly sunspots",
     main = "sunspot activity")
par(opt)


# comparing groups by overlaying them
opt <- par(bg = "grey95")
data <- window(timeseries, start = c(startYears[1], 1),
                end = c(startYears[2] - 1, 12),
                frequency = 12)
plot(1:length(data), data,
     type = "l", col=cols[1], lwd = 2,
     xlab = "Month number in 11 year sequence", ylab = "sqrt monthly sunspots",
     main = "all 11 tear sequence: 1749-1983")

for ( i in 2:(ncycles - 1)) {
 data <- window(timeseries, start = c(startYears[i], 1),
                 end = c(startYears[i+1] - 1, 12),
                 frequency = 12)
 lines(1:length(data), data, col = cols[i], lwd = 2)
}

# tidy up the last partial cycle
data <- window(timeseries, start = c(startYears[ncycles], 1),
               end = end(timeseries),
               frequency = 12)
lines(1:length(data), data, col = cols[ncycles], lwd = 2)
par(opt)



#### an interactive display
library(loon)

# plot the original data, marking 11 year cycles in colour
p <-l_plot(x =time(timeseries), y=as.vector(timeseries),
           ylabel="Mean sqrt monthly sunspots",
           xlabel="Time",
           title="Sunspot activity",
           linkingGroup="sunspots",
           color = cols[cycleNums], size=1,showGuides=TRUE)

#  And all the cycles over top of each ot
cycleplot <-l_plot(x = cyclePositions,
                  y=as.vector(timeseries),
                  ylabel="Mean sqrt monthly sunspots",
                  xlabel="Month number in 11 year sequence",
                  title="11 year sequences",linkingGroup="sunspots",
                  color = cols[cycleNums], size=1,showGuides=TRUE)

# Adding a histogram in the same linking group allows
#    all to the# plots to be linked together
h <-l_hist(x= cycleNums,xlabel="Cycle number",
           title="Sunspot  cycles",linkingGroup="sunspots",
           color = cols[cycleNums],showStackedColors =TRUE,
           showGuides=TRUE)

# now add the lines as layers
data <- window(timeseries, start = c(startYears[ncycles], 1),
               end = c(startYears[2] - 1, 12),
               frequency = 12)
l_layer_line(cyclesplot, x = 1:length(data), y = data, color = cols[1],
             label = paste(start(data)[1], end(data)[1], sep="-"), indx = "end")
l_layer_line(p, x = time(data), y = data, color = cols[1],
             label = paste(start(data)[1], end(data)[1], sep="-"), indx = "end")

for (i in 2:(ncycles - 1)){
  data <- window(timeseries, start = c(startYears[i], 1),
                 end = c(startYears[i+1] - 1, 12),
                 frequency = 12)
  l_layer_line(cyclesplot, x = 1:length(data), y = data, color = cols[i],
               label = paste(start(data)[1], end(data)[1], sep="-"),
               indx = "end")
  l_layer_line(p, x = time(data), y = data, color = cols[i],
               label = paste(start(data)[1], end(data)[1], sep="-"),
               indx = "end")

}

# tidy up the last partial cycles
data <- window(timeseries, start = c(startYears[ncycles], 1),
               end = end(timeseries),
               frequency = 12)
l_layer_line(p, x = time(data), y = data, color = cols[ncycles],
             label = paste(start(data)[1], end(data)[1], sep="-"),
             indx = "end")

time series decomposition

plot(co2)


# explore it interactively
library(loon)
getDates <- function(ts){
  mapply(FUN = function(x,y) {paste(x,y)}, floor(time(co2)),
         month.name)
}
data = co2

p <- l_plot(data, title="Muna loa atmospheric CO2",
            ylabel = "concentration (ppm)",
            xlabel = "Date", linkingGroup = "co2", color = "grey20",
            showScales = TRUE, showGuides = TRUE,
            itemLabel = getDates(data), showItemLabels = TRUE)

# another way to get the x and y coordinates
xy.raw <- xy.coords(data)
l_layer_line(p, x = xy.raw$x, y = xy.raw$y,
             clor = "grey50", linewidth = 3, index = "end",
             label = "connect the dots")




# a couple of helper functions
getYears <- function(ts) {unique(floor(time(ts)))}
getYears(co2)

# get the number of the month
# this will actually work more generally using frequency()
getMonthNos <- function(ts){1:frequency(ts)}
getMonthNos(co2)

# we can now computate the averages, first over all months for each year, and
#   second over all years for each month
getAves <- function(x, by = "year") {
  years <- getYears(x)
  monthNos <- getMonthNos(x)
  nyears <- length(years)
  nmonths <- length(monthNos)
  if (! (by %in% c("year", "month"))){
    stop("unkwown value for by = ",
         by,
         "; by must be one of {\"year\",\"month\"}")
         }
  if (by == "year") {
    aves <- sapply(1:nyears,
                   FUN = function(i) {
                     mean(window(x,
                                 start = c(years[i], monthNos[1]),
                                 end = c(years[i], monthNos[nmonths])))
                   })
    aves <- data.frame(aves = aves, row.names = years)
  } else {
    aves <- sapply(1:nmonths,
                   FUN = function(i) {
                     mean(window(x,
                                 start = c(years[1], monthNos[i]),
                                 end = c(years[nyears], monthNos[i]),
                                 frequency = 1))
                   })
    aves <- data.frame(aves = aves, row.names = month.abb[monthNos])
  }
  aves
}



head(getAves(co2, by = "month"))

# plot them
plot(getYears(co2), getAves(co2, by = "year")$aves,
     type = "l", xlab = "Year",
     ylab = "annual average over ll months")
plot(getAves(co2, by = "month")$aves,
     type = "l", xlab = "Year",xaxt = "n",
     ylab = "monthly average over all years")
axis(1,at = getMonthNos(co2), labels = month.abb) #month.abb: month abbreviations



# reduce the white space between plots
savePar <- par(mfrow = c(3,1),
               mar = c(2,4,1.5,0.5), oma = rep(0.1,4))



# decompositions of a time series
# the default decomposition is an additive one
co2_decomp <- decompose(co2)
# having structure
str(co2_decomp)
plot(co2_decomp)


# we can see how the local fits behave as we change various elements
#  xloc is the location where we are fitting
#  proportionRange stands for proportion of the x range, which determine
#     what is meant by local
# all weights are 1 or zero

ourdata <- data.frame(x = xy.raw$x, y = xy.raw$y)
unitWeight <- function(xloc, x, proportionRange = 0.5){
  xhalf <- diff(range(x)) * proportionRange/2
  wts <- rep(0, length(x))
  wts[x < xloc + xhalf & x > xloc - xhalf] <- 1
  wts
}

# we have our data with x and y as before and need weights
xloc <- 0.5
weights <- unitWeight(xloc, xy.raw$x, proportionRange = 1)
# and use thes to fit a line

fitw <- lm(y~x, data = ourdata, weights = weights)


# use a weight function shaped like a Gaussian
GaussianWeight <- function(xloc, x, h = 1){
  # normal density
  dnorm(x, mean = xloc, sd = h)
}




# the function that will produce a smooth cureve from minimizing the locally
#   weighted sum of squares, or LoWeSS.
# it requires:
#   x,y:  - the data
#   xloc : - x locations at which the estimate is to be calculated
#   span: - a bandwidth
#   weightFn: - a weigthed function ,default will be GaussWeight
#   nloc: - number of equi-spaced locations at which estimate will be calculated
#               if xloc = NULL, ignored otherwise

LoWeSS <- function(x, y, xloc = NULL, span = 0.1, weightFn = GaussianWeight,
                   nlocs = 200){
  data <- data.frame(x= x, y = y)
  xrange <- extendrange(x)
  bandwidth <- (diff(xrange)*span)/4

  if (is.null(xloc)){
    xloc <- seq(xrange[1], xrange[2], length.out = nlocs)
  }
  estimates <- vector("numeric", length = length(xloc))

  for (i in 1:length(xloc)){
    weights <- weightFn(xloc[i], x, bandwidth)
    fit <- lm(y~x, data = data, weights = weights)
    pred <- predict(fit, newdata = data.frame(x= xloc[i]))
    estimates[i] <- pred
  }
  # return the estimates
  list(x = xloc, y = estimates)
}

x <- xy.raw$x
y <- xy.raw$y
smooth <- LoWeSS(x,y, nlocs = 1000, span = 0.75)
plot(x,y,col="grey50", pch = 19, main = "our data", type = "p")
lines(smooth$x, smooth$y, col = "steelblue", lwd = 3)


# the data
plot(x, y , col = "grey50", pch = 19, main = "relating y to x")


# the LoWeSS estimate
lines(smooth$x, smooth$y, col = "steelblue", lwd = 3)
# add the true mu(x)
xordered <- sort(x)
library(stpm)
lines(xordered, mu(xordered), col = "firebrick", lwd = 3, lty = 2)
      # mu function is in library(stpm)

# locally weigthed sums of squares
fit <- loess(y~x, data = ourdata)
plot(x,y,col = "grey50", pch =19, main = "relating y to x")  
# get the values predicted by the loess
pred <- predict(fit)
# get the order of the x
ord <- order(x)
lines(x[ord], pred[ord], lwd = 2, col = "steelblue")  


# we could also reduce the span being used here (default is 0.75)
fit_Large <- loess(y~x, data = ourdata, span = 0.15)
pred_Large <- predict(fit_Large)
plot(fit)


# using spans to decompose
# first fit the large span
fit <- loess(y~x, data = ourdata, span = 0.75)
# from the fit we extract the estimated residuals
r_Large <- fit_Large$residuals

# this is another data set that we can plot
plot(x, r_Large, col = "grey50", pch =19,
     main = "s_small fit of r_large on x")
# and we can fit this with a smaller span
data_resids <- data.frame(r = r_Large, x = x)
fit_small <- loess(r~x, data = data_resids, span = 0.2)

# get the values predicted by the loess
pred_small <- predict(fit_small)
# need the order of x
ord <- order(x)
lines(x[ord], pred_small[ord], lwd = 2, col = "steelblue")


r_Remains <- fit_small$residuals
data.remains <- data.frame(r = r_Remains,  x= x)
fit_Tiny <- loess(r~x, data = data.remains, span = 0.5)
# get the values predicteed by the loess
pred_Tiny <- predict(fit_Tiny)


# putting together for the various fitted smooths we have adecomposition of the
#  observed values as
opt <- par(mfrow = c(3,1), mar = c(0,4,4,2) + 0.1)

plot(x[ord], pred_Large[ord], xlab = "", ylab = "", xaxt = "n",
     col = "grey50", pch =19, cex = 0.5, main = "large span smooth")

# add a range of residuals rectangle
xleft <- 1.01
xright <- 1.02
yDelta <- sd(r_Remains)
rectCol <- adjustcolor("red", 0.5)
ymiddle <- mean(pred_Large)
rect(xleft, ymiddle - yDelta, xright, ymiddle+yDelta,
     border = "grey50", col = rectCol)
plot(x[ord], pred_small[ord], xlab = "", ylab = "", xaxt = "n",
     col = "grey50", pch = 19, cex= 0.5, main = "small span smooth")
# add the same range rectangle
ymiddle <- mean(pred_small)
rect(xleft, ymiddle - yDelta, xright, ymiddle+yDelta,
     border = "grey50", col = rectCol)

plot(x[ord], pred_small[ord], xlab = "", ylab = "", xaxt = "n",
     col = "grey50", pch = 19, cex= 0.5, main = "remaining residuals")
ablines(h =0, col = "lightgrey")

# add the same range rectangle
ymiddle <- mean(r_Remains)
rect(xleft, ymiddle - yDelta, xright, ymiddle+yDelta,
     border = "grey50", col = rectCol)

rect(xleft, ybottom, xright, ytop, border = "grey50", col = rectCol)
par(opt)



# the seasonal components now vary a little more over the years
# there are similar arguements t.window and t.degree for the loess smoothing of the
# trend component
plot(stl(co2, s.window = 7, s.degree = 1, t.degree = 1))

library("loon")
l_plot(stl(co2, s.window = 7, s.degree = 1, t.degree = 1))
demo("l_timeseries")

transforming data

x <- round(rnorm(10), digits = 2)
# the ranks (in order smallest to largest)
rank(x)

# the family of power transformation
powerfun <- function(x, alpha){
  if (alpha == 0) {
    log(x)
  } else {
    (x^alpha - 1)/alpha
  }
}

library(loon.data); data(SAheart, package = "loon.data")
library(qqtest)
data <- SAheart$sbp
savePar <- par(mfrow = c(1,2))
# note alpha is in descending order here
for (alpha in c(2,1,1/2,1/3,0,-1/3,-1/2,-1,-2)) {
  newdata <- powerfun(data, alpha)
  dens <- density(newdata, bw = "SJ")
  h <- hist(newdata, plot = FALSE)
  ylim <- extendrange(c(dens$y, h$density))
  plot(h, ylim = ylim, freq = FALSE, yaxt = "n", xaxt = "n",
       main = "SAheart", xlab = paste0("(sbp)^", round(alpha, digits = 2)),
       ylab ="")
  polygon(dens, col = adjustcolor("grey50", 0.3))
  qqtest(newdata, main = "normal qqplot", legend = FALSE, cex = 0.5)
}
par(savePar)


# an interactive exploration of chnaging alpha
library(loon)
# something to scale the data
scale01 <- function(x) {
  (x - min(x))/diff(range(x) + 1)
}

## setting up the interactive plots that will respond to chaning power
power <- function(x, xlab = NULL, labels = NULL, lowest = -5, highest = 5, ...){
  if (is.null(xlab)) {xlab <- "x"}
  if (is.null(labels)) {labels <- as.character(1:length(x))}

  #scale and order the x values
  xs <- scale01(x)
  ord <- order(xs)
  xs <- xs[ord]
  labels <- labels[ord]
  # normal quantiles
  qs <- qnorm(ppoints(length(x)))
  # qqplot
  p <- l_plot(x = qs, y = xs, itemLabel = labels,
              ylabel = xlab, xlab = "Gaussian qunatiles", ...)
  # now we iwll build a more complex interface a histogram plus a slider for the
  # power
  # the top level window
  tw <- tktoplevel()
  # create and add the histogram to the top window
  hx <- l_hist(xs, xlabel = xlab, yshows = "density", parent = tw, ...)
  # now create a slider whose value will be the alpha of the power
  #  transformation
  alpha_x <- tclVar("1")
  # slider for x
  sx <- tkscale(tw, orient = 'horizontal', label = 'power x',
                variable = alpha_x, from = lowest, to = highest,
                resolution = 0.1)
  # pack the histogram and slider into the same window
  tkpack(sx, fill = 'x', side = 'bottom')
  tkpack(hx, fill = 'both', expand = TRUE)

  # now we need an update function to evaluate when the slider changes

  # this is the power function that will change with the slider
  update <- function(...){
    alphax <- as.numeric(tclvalue(alpha_x))
    newx <- sort(scale01(powerfun(x,alphax)))
    # update the label
    xlabel <- if(alphax ==0){paste0("log(",xlab,")")
      } else{
          if (alphax == 1){
            xlab
          } else {
            paste0(xlab, "^", alphax)
          }
      }
    xrng <- range(newx)
    # update the qqplot p
    l_configure(p, y = newx, ylabel = xlabel, xlabel = "Gaussian quantiles")
    # update the histogram
    binwidthx <- hx['binwidth']
    l_configure(hx, x = newx, binwidth = binwidthx, xlabel = xlabel)
    l_scaleto_world(hx)
  }
  # configure the scale to call "update"
  tkconfigure(sx, command = update)
  invisible(p)
}



with(SAheart,
     p <- power(x= sbp, labels = paste("Patient", rownames(SAheart)),
                linkingGroup = "SAheart"))

# turkey's ladder of power transformation
data <- rchisq(200, 2)
power(data) # Create a Power Link Object
# if it is concenrated on lower values, then move the power lower on the ladder
# otherwise, in the reverse order

# power transformation
library(MASS)
data(mammals)

# straightening plots via power transformation
power_xy <- function(x, y = NULL, xlab = NULL, ylab = NULL,
                     from= 5, to = -5,...){
  if (is.null(xlab)) {xlab <- "x"}
  if (is.null(ylab)) {ylab <- "y"}

  scale01 <- function(x) {
    (x - min(x))/diff(range(x) + 1)
  }
  tt <- tktoplevel() # this is about dynamic graphic in r (你可以看一下)
  # https://www.stat.berkeley.edu/~spector/s244/Dyngraph.html
  # The basic widget in Tk is the frame, which is essentially a container for
  # any other widgets you wish to use.
  # You can have as many frames as you want, but each individual entity which
  # appears on the screen must be in a "top level" frame,
  # created with the tktoplevel function.
  # When using Tk, you first create a widget (specifying its appearance, the
  # command to be executed when the widget is activated, etc.), and then you
  # specify how it should appear in its parent frame. While Tk provides a number
  # of different schemes for achieving this, we'll discuss the technique known as
  # "packing". The basic idea is to group together widgets (and accompanying
  # labels, if appropriate) into a series of frames, and then to call the tkpack
  # function to put the widgets in the frames, then to finally pack the frames
  # into a main frame which is contained in a top-level widget
  # By default, objects are backed into a frame from top to bottom, with each
  # object being centered within its parent frame. You can control how widgets
  # get packed into a frame by the order in which you call tkpack, and the
  # optional side= argument which can take the values "left", "right", "top"
  # or "bottom"
  xs <- scale01(x)
  ys <- scale01(y)

  # scatterplot
  p <- l_plot(x= xs, y = ys, xlabel = xlab, ylabel = ylab,
              parent = tt,...)
  # a valid Tk parent widget path. When the parent widget is specified (i.e.
  #   not NULL) then the plot widget needs to be placed using some geometry
  #   manager like tkpack or tkplace in order to be displaye
  fit <- lm(ys~xs)
  # layer a fitted line
  xrng <- range(xs)
  yhat <- predict(fit, data.frame(xs = xrng))
  line <- l_layer_line(p, x= xrng, y = yhat, linewidth = 2, index = "end")

  # histogram on each
  hx <- l_hist(xs, xlabel = xlab, yshows = "density")

  # setting up separate sliders for each power transformation
  alpha_x <- tclVar('1')
  alpha_y <- tclVar('1')
  sx <- tkscale(tt, orient = 'vertical', label = 'power x', variable = alpha_x,
                from = from, to = to, resoluation = 0.1)
  sy <- tkscale(tt, orient = 'vertical', label = 'power x', variable = alpha_y,
                from = from, to = to, resoluation = 0.1)
  tkpack(sy, fill = 'y', side = 'right')
  tkpack(p, fill = 'both', expand = TRUE)

  update <- function(...) {
    alphax <- as.numeric(tclvalue(alpha_x))
    alphay <- as.numeric(tclvalue(alpha_y))
    newx <- scale01(powerfun(x, alphax))
    newy <- scale01(powerfun(y, alphay))
    xlabel <- if (alphax == 0) {paste0("log(", xlab,")")} else{
      if (alphax== 1) {xlab}else {
        paste0(xlab, "^", alphax)
      }
    }
    ylabel <- if (alphay == 0) {paste0("log(", ylab,")")} else{
      if (alphay== 1) {ylab}else {
        paste0(ylab, "^", alphay)
      }
    }
    # fit the line to the new data
    fit.temp <- lm(newy~newx)
    xrng <- range(newx)
    ## get the fitted line
    yhat <- predict(fit.temp, data.frame(newx=xrng))
    # update the line values
    l_configure(line, y= yhat, x=xrng)
    # reconfigure the points
    l_configure(p, x = newx, y= newy, xlabel = xlabel, ylabel = ylabel)
    # and the histogram
    binwidthx <- hx['binwidth']
    binwidthy <- hx['binwidth']
    l_configure(hx, x = newx, binwidth = binwidth,
                xlabel = xlabel)
    l_scaleto_world(hx) #Change Plot Region to Display All Plot Data
    l_configure(hy, x = newy,
                binwidth = binwidthy,
                xlabel = ylabel)
    l_scaleto_world(hy)
  }
 # add the update functions to the sliders
  tkconfigure(sx, commmand = update)
  tkconfigure(sy, commmand = update)
  invisbile(p)
}

categorical data overview

library(loon)
areas <- unique(olive[,"Area"])
nAreas <- length(areas)
counts <- rep(0, nAreas)
regions <- c()
# construct the dataset
for (i in 1:nAreas) {
  counts[i] <- sum(olive[,"Area"] == areas[i])
  regions <- c(regions, unique(olive[olive[,"Area"] == areas[i], "Region"]))
}
# turn regions into a factor with the proper names
regions <- as.factor(levels(olive[,"Region"])[regions])
# our constructed dataset of counts
oliveOilCounts <- data.frame(Region = regions, Area = areas, count= counts)

library(treemap)
treemap(oliveOilCounts, index = c("Region", "Area"), vSize = "count") # vsize could be any variate




# could construct one interactively
itreemap(oliveOilCounts, index = c("Region"), vSize = "count")



# another package that based on ggplot2 for graphics and employs an algorithm which
# tries to achive much squarer boxes

library(treemapify)
library(ggplot2)
treeMapPlot <- ggplot(oliveOilCounts, aes(area = count, fill = Region, # colour by region
                                          label = Area, subgroup = Region))
treeMapPlot + geom_treemap() + geom_treemap_text() +
  geom_treemap_subgroup_border() + geom_treemap_subgroup_text()


# alternatively, the coordinates of the squares can be had via the treemapify(...)
# function
library(treemapify)
# alternatively
treeMapCoordinates <- treemapify(oliveOilCounts, area = "count",
                                   subgroup = "Region")
head(treeMapCoordinates)




# non-nested data - venneuler package
library(venneuler)  # 姐，这个我不准备run了，因为这个要下载java里的jdk， 还有rstudio
# 里的rjava和vennecular， 我不想下载了
ve <- venneuler(c(A=1, B=1, "A&B" = 1))
plot(ve)

HairEye <- apply(HairEyeColor, 1:2, sum)
library(eikosograms)
eikos(y = "Hair", x = "Eye", data = HairEye, xaxs = FALSE, yaxs = FALSE)

categorical data

#######################################################
# categorical data
#######################################################
city <- c("Kitchener", "Waterloo")
housing <- c("House", "Apartment", "Residence")
# fake data
counts <- rpois(6, lambda = 50)
# arranged in a rectangular array
vacancy <- matrix(counts, nrow = length(city), ncol = length(housing),
                  byrow = TRUE, dimnames = list(city = city, housing = housing))
# coereced to be an object of class table
vacancy <- as.table(vacancy)
vacancy
# creating a multi-way table
term <- c("Fall", "Winter", "Spring")
# more fake counts
counts <- seq(from = 10, to = 180, by = 10)
vacancy <- array(counts,
                 dim = c(length(city), length(housing), length(term)),
                 dimnames = list(city = city, housing = housing, term = term))
vacancy <- as.table(vacancy)



# reordering variates in tables
# the array dimension permutation function aperm(..)
aperm(vacancy, perm = c(3,2,1))



# construting tables from data
library(loon.data)
data("SAheart", package = "loon.data")
table(SAheart$chd, SAheart$famhist, dnn = c("chd", "famhist"))
# by cross-tabulation ("cross tabs or xtabs)
xtabs(~chd + famhist, data = SAheart)


# collapsing dimensions (marginalizing, projecting)
# zero dimensional
margin.table(HairEyeColor)

# 1 dimensional
margin.table(HairEyeColor, margin = 1)
# 2 dimensioanl -- here margins 1 and 2 ("hair", "eye") are preserved
margin.table(HairEyeColor, margin = c(1,2))

# note: except for 0 dimensional. there are the same as using apply with sum
apply(HairEyeColor, MARGIN = 1, FUN = sum)


# adding sums to a table
# every margin is summed
addmargins(HairEyeColor)


# just producing marginal sums over dimension 2("eye") values
# for each pair(i,k) pf remaining variates "hair" and "sex
addmargins(HairEyeColor, margin = 2)


# producing marginal sums over both dimensions 1 and 2 ("hair", and "eye")
addmargins(HairEyeColor, margin = c(1,2))


# sums to proportions
# no margins fixed, just total.. single multinomial
round(prop.table(HairEyeColor), 3)

# one margin is fiexed, (the theird here, i.e. Sex) is fixed,... as may multinomial
#  as in
round(prop.table(HairEyeColor, margin = 3),2)
# probability models from tables
# easier to see with sums for a two way table
HairEye <- margin.table(HairEyeColor, margin = c(1,2))
# sum of each table's proportions must be one
round(prop.table(HairEye, margin = 2), 2)


library(tidytable) # provide an implementation of the rules we developed
# for table analysis

tidytable(HairEyeColor)$table

proportions <- round(prop.table(HairEye, margin = 2), 2)
tidyproportions <- tidytable(proportions)
tidyproportions$proportions

categorical modelling

#######################################################
# categorical modelling - modelling and independence
#######################################################
HairEye <- margin.table(HairEyeColor, margin = c(1,2))
total <- margin.table(HairEye)
colsums <- margin.table(HairEye, margin = 2)
colsums
rowsums <- margin.table(HairEye, margin = 1)
rowsums
rowProbs <- rowsums / total
rowProbs
# Draw one sample for column 1
rmultinom(n = 1, size = colsums[1], prob = rowProbs)

# the function that generates a new table for fixed column totals
#   and given row probabilities
generateCondTable <- function(columnSums = seq(100, 300, 100),
                              rowProbs = seq(0.1,0.4,0.1),
                              dnn = NULL) {
  if (is.null(dnn)) {
    dnn <- list(Y = paste0("y_", 1:length(rowProbs)),
               X = paste0("x_", 1:length(columnSums))
               )
  }
 nRows <- length(rowProbs)
 nCols <- length(columnSums)
 result <- matrix(nrow = nRows, ncol = nCols)
 for (j in 1:nCols){
   result[,j] <- rmultinom(n = 1, size = columnSums[j], prob = rowProbs)
 }
 result <- as.table(result)
 dimnames(result) <- dnn
 result
}



# unconditional - NO fixed margin
generateUncondTable <- function(total = 1000, colProbs = seq(0.1, 0.4, 0.1),
                                rowProbs = seq(0.4, 0.1, -0.1), dnn = NULL){
  colSums <- rmultinom(n=1, size = total, prob = colProbs)
  generateCondTable(columnSums = colSums, rowProbs = rowProbs, dnn = dnn)
}

# generating a new table under the hypothesis of independence , from an existing
# table of counts
generateTable <- function(Table, y, x, conditionalTest = TRUE) {
  new_table <- margin.table(Table, margin = c(y,x))
  #get the marginal probabilities for the response variates
  rowSums <- margin.table(new_table, margin = y)
  rowProbs <- prop.table(rowSums)
  # get the marginal probailities for the conditioning variate
  colSums <- margin.table(new_table, margin = x)
  colProbs <- prop.table(colSums)
  # get the dnn
  dnn <- dimnames(new_table)
  if (conditionalTest){
    generateCondTable(columnSums = colSums, rowProbs = rowProbs,
                      dnn = dnn)
  } else {
    generateUncondTable(total = sum(new_table),
                        colProbs = colProbs,
                        rowProbs = rowProbs,
                        dnn = dnn)
  }
}

# need to functions of the single argument data which will geenrate new data under
# the hypothesis
generateTableDataCond <- function(data){
  newtable <- generateTable(data$table, y = data$y, x = data$x,
                            conditionalTest = TRUE)
  list(table = newtable, y= data$y, x = data$x)
}

generateTableDataUncond <- function(data){
  newtable <- generateTable(data$table, y = data$y, x = data$x,
                            conditionalTest = FALSE)
  list(table = newtable, y= data$y, x = data$x)
}


# the function that will show an eikosogram
showTable <- function(data, Suspect, draw = FALSE){
  result <- eikos(y =data$y, x = data$x,
                  data = data$table, legend = FALSE,
                  xlabs = FALSE, ylabs = FALSE,
                  xaxs = FALSE, yaxs = FALSE,
                  main = paste("Suspect", Suspect),
                  draw = draw) # default don't draw
  invisible(result)
}


# since the eikos function is implemented using grid, need to update the lineup function

lineup1 <- function(data, showSuspect = NULL, generateSuspect = NUll,
                   trueLoc = NULL, layout = c(5,4),
                   pkg = c("Graphics", "grid", "ggplot2")){
  # get the number of suspects in total
  nSuspects <- layout[1] *layout[2]
  if (is.null(trueLoc)) {trueLoc <- sample(1:nSuspects, 1)}
  if (is.null(showSuspect)) {stop("need a plot function for the subject")}
  if (is.null(generateSuspect)) { stop("need a function to generate subject")}

  # need to decide which subject to present
  presentSuspect <- function(suspectNo){
    if (SuspectNo != trueLoc) {data <- generateSuspect(data)}
    showSuspect(data, suspectNo)}

 # plotting depends on the plotting package
  pkg <- match.arg(pkg) # Argument Verification Using Partial Matching
  switch(pkg,
         "graphics" = {
           savePar <- par(mfrow = layout,
                          mar = c(2.5,0.1,3,0.1),
                          oma = rep(0,4))
           sapply(1:nSuspects, FUN = presentSuspect)
           # the plot layout here is of no value for graphics
           plotLayput <- layout
           par(savePar)
         },
         "grid" = {
           grobs <- lapply(1:nSuspects, FUN = presentSuspect)
           plotLayout <- arrangeGrob(grobs = plots, # Arrange multiple grobs on a page
                                     nrow = layout[1], ncol = layout[2])
         },
         "ggplot2" = {
           plots <- lapply(1:nSuspects, FUN = presentSuspect)
           plotLayout <- arrangeGrob(grobs = plots,
                                     nrow = layput[1], ncol = layout[2])
         },
         stop("Wrong pkg"))
  # return obfuscated location and plot (if not "graphics)
  list(trueLoc = hideLocation(trueLoc, nSuspects),
       plotLayout = plotLayout)
}

results <- lineup(..., pkg = "grid",...) # must supple other args
grid.arrange(results$plotLayout) # display the lineup
revealLocation(results$trueLoc) # reveal the true location

library(eikosograms); library(gridExtra)
results <- lineup1(data,
                  generateSuspect = generateTableDataCond,
                  showSuspect = showTable,
                  layout = c(5,5),
                  pkg = "grid")

# display equivalence in eikosograms
titanicsurvclass <- margin.table(Titanic, margin = c(1,4))
mosaicplot(titanicsurvclass, main = "MOsaic plot (two way table")

# spine plot
spineplot(titanicsurvclass, main = "spineplot (two way table")



# eikosograms and mosaic plots differ when there are more than two variates
eikos(y = "Survived", x = c("Class", "Sex"),
      data = Titanic, xaxs = FALSE, yaxs = FALSE,
      main = "Eikosogram (three way table)")

# mosaic plot, need to construct the marginal table first,
# note that the table is ordered so that survived is first,
# then class and sex (to match the eikosogram order above)
titanicsurvclasssex <- margin.table(Titanic, c(4,1,2))

# the mosaic plot
mosaicplot(titanicsurvclasssex, main = "mosaic plot ( three way table")

onmaps

#######################################################
# onmaps
#######################################################
library(maps)
savePar <- par(mfrow = c(1,2))
# first the map from the maps package
map("world", "Canada", xlim = c(-141, -53), ylim = c(40, 85),
    col = "grey90", fill= TRUE)
title(main = "maps package")
# or use the map from the mapdata package
library(mapdata)
map("worldHires", "canada", xlim = c(-141, -53), ylim = c(40, 85),
        col = "grey90", fill = TRUE)
title(main = "mapdata package")
par(savePar)

# annual precipitation as thermometer glyphs
library(maps)
precip <- read.csv("/Users/xuanxiaowu/Desktop/stat442/rcode/precip.csv")
# 这个csv可以在网上找到， 如果你想要的话我给你email， 或者等过几天我学一下github

n <- nrow(precip)
lat <- vector("numeric", n)
pop <- vector("numeric", n)
long <- vector("numeric", n)
for (i in 1:n) {
  city <- precip[i, "name"]
  selected.city <- canada.cities[,"name"] == city
  lat[i] <- canada.cities[selected.city, "lat"]
  long[i] <- canada.cities[selected.city, "long"]
  pop[i] <- canada.cities[selected.city, "pop"]
}

cities <- data.frame(precip, lat = lat, long= long, pop = pop)
# take the subset
indices <- c(5,13,18, 20, 24, 26, 27, 29, 32)
cities <- cities[indices, ]
map("world", "Canada", plot = TRUE, fill = FALSE)
symbols(cities$long - 1, cities$lat, inches = FALSE,
        thermometers = cbind(rep(2, nrow(cities)),
                             rep(4, nrow(cities)),
                             (cities$precip)/2000), # precip in mm
        fg = "red",
        add = TRUE)


# statiscal glyphs - any kind of graphic
vars <- c("days", "precip.mm.", "pop")
nVars <- length(vars)
ncities <- nrow(cities)
cityVals <- cities[,vars]
maxVals <- apply(cityVals, 2,max)
# get values as proportions of each column
stdCities <- sweep(cityVals, 2, maxVals,`/`) # sweep: Return an array obtained
              #from an input array by sweeping out a summary statistic.
stdCitiesList <- Map(function(rowNum) {t(stdCities[rowNum,])},
                     1:ncities) # apply a function to each element of a vector
# in the loon package, the function l_make_glyphys() will create glyphs from any
# statiscal graphic
library(loon)
# create the glyphs for each city
my_glyphs <- l_make_glyphs(stdCitiesList,
                           draw_fun = function(city){
                             par(mar = rep(1,4))
                             barplot(height = city,
                                     beside = TRUE,
                                     axes = FALSE,
                                     col = rainbow(nVars),
                                     axisnames = FALSE,
                                     ylim = c(0,1))
                           }, width = 100, height = 100)
# can view the glyphs
l_imageviewer(my_glyphs)

# these can be added as points symbols to any l_plot
# plot city locations
p <- l_plot(cities$long, cities$lat,
            xlabel = "longitude",
            ylabel = "laitude",
            showLabels = FALSE)
# add the map
Canada <- map("world", "Canada", fill = TRUE, plot = FALSE)
l_map <- l_layer(p, Canada, asSingleLayer = TRUE, color = "cornsilk")
l_scaleto_world(p) #Change Plot Region to Display All Plot Data
# then add the images as glyphs called "barplot"
g <- l_glyph_add_image(p, my_glyphs, "barplot")
p['glyph'] <- g

library(loon)
theEast <- cities$long > -80
p["selected"] <- theEast

# readShapeSpatial {maptools}	R Documentation
#       Read shape files into Spatial*DataFrame objects


# rm(list = ls()): rm will remove all of the objects that are stored in
# your global environment (which may be what you want) but will not unload any
#   of the packages that you have loaded. You can do both by restarting your
# R session in RStudio with the keyboard shortcut Ctrl+Shift+F10 which will
# totally clear your global environment of both objects and loaded packages.
#  use graphic.off() to clear all the plots you have generated

three dimensions

#######################################################
# three dimensions
#######################################################
library(MASS)
library(ggplot2)
# use square grids, count the number of points in each cell
# code count to saturation. here use geom_bin2d() from ggplot2
p <- ggplot(geyser, aes(x = duration, y = waiting))
h <- p + geom_bin2d(bins = 15)
h
# to switch the color around
library(RColorBrewer)
# brew a sequence of blues
blues <- brewer.pal(9, "Blues")
darkerBlues <- blues[-(1:2)] # remove the two lightest blues
h + scale_fill_gradientn(colours = darkerBlues)


# use hexbin for hexagonal binning
library("hexbin")
bins <- hexbin(x = geyser$duration, y = geyser$waiting, xbins = 15)
plot(bins, main = "hexagonal bin histogram")
# alternatively we cna use the hexbinplot directly
hexbinplot(waiting~duration,data = geyser, xbins = 15,
           colramp = function(n) rev(gray.colors(n)), # rev:Reverse Elements
           aspect = 1,
           main = "Hexagonal bin histogram")

#  or we could use size instead of the colour to indicate counts
hexbinplot(waiting~duration, data = geyser,
           style = "lattice", aspect = 1,
           main = "hexagonal bin histogram")
# both colour and size are used
hexbinplot(waiting~duration, data = geyser,
           style = "nested.lattice", aspect = 1,
           main = "hexagonal bin histogram")


# density estimation in two-dimensions
# calculate the contours of the kernel function
nxvals <- 100
nyvals <- 100
x <- seq(-5, 5,length.out = nxvals)
y <- seq(-5, 5,length.out = nyvals)
gridLocs <- expand.grid(x,y)
# now have the indices to match the locations
indices <- cbind(rep(1:nxvals, times = nyvals), rep(1:nyvals, each = nxvals))
# a matrix to store away the kernel values
kernel <- matrix(0, nrow = nxvals, ncol=nyvals)

# need multivariate nromal densities from mvtnorm
library('mvtnorm')
# bivaraite normal density
# (default is mean 0, and var-cov matrix the 2x2 identity matrix)
kernel[indices] <- dmvnorm(gridLocs)
#there is a contour function to showmlevel curves
contour(x=x, y=y, z= kernel, asp = 1,
        xlim = c(-2,2), ylim = c(-2,2),
        col = "black", lty = 2,
        main = "contours of a standard normal")
points(0,0, pch = 19, col = "steelblue")




# we can use this bivariate density as a kernel for bivariate density estimation
# a function that performs the density estimation is kde2d() from the mass package
x <- geyser$duration
y <- geyser$waiting
den <- kde2d(x=x, y = y , n = 100)
str(den)

# now plot the density estimates as contours
zlim <- range(den$z)
plot(x,y,pch =19,
     col = adjustcolor("steelblue", 0.5))
contour(den$x, den$y, den$z, col = "grey10",
        levels = pretty(zlim, 10), lwd = 1, add = TRUE)
# pretty :Compute a sequence of about n+1 equally spaced ‘round’ values
# which cover the range of the values in x. The values are chosen so that
# they are 1, 2 or 5 times a power of 10.

# could add colour images of density
library(RColorBrewer)
blues <- brewer.pal(9, "Blues")
image(den$x, den$y, den$z, col = blues, xlab = "duration",
      ylab = "waiting", main = "rectangles coloured by density")


# draw the density in three dimensions
# using the perspective "wire frame" drawings
lineCol <- "steelblue"
fillCol <- "cornsilk"
savePar <- par(mfrow = c(1,2))
# 3d surface
persp(den$z, border = lineCol, col = fillCol,
      xlab = "duration", ylab = "waiting time",
      zlab = "density", main = "geyser density estimate",
      sub = "solid")

# see through
persp(den$z, border = lineCol, col = adjustcolor(fillCol,0.5),
      xlab = "duration", ylab = "waiting time",
      zlab = "density", main = "geyser density estimate",
      sub = "transparent")

par(savePar)

# can change the viewing angles
theta.val <- 0   # horizontal moving right to the left
phi.val <- 0 # angle of tilt toward viewer
lineCol <- "steelblue"
fillCol <- "cornsilk"
persp(den$z, border = lineCol, col = fillCol,
      xlab = "x", ylab = "y", zlab = "z",
      # some magic below:
      # bquote: allows partial substitution;
      # terms wrapped in .() are evaluated
      main = bquote(paste(phi, "=", .(phi.val),";",
                          thera,"=",.(theta.val))),
      theta = theta.val,
      phi = phi.val)
# radius r, the distance of the eyepoint from the centre of the box
# changing the radius (think of points at infinity)
persp(den, border = lineCol, col = fillCol,
      r = 0.5, phi = 0, xlab = "x", main = "r = 0.5")# the smaller the r, the sooner the parallel lines meet

# the lattice package could be used
library("lattice")
wireframe(den$z, xlab = "duration", ylab = "waiting", zlab = "density",
          col = "grey50")

# can change the relative dimensions of the box
wireframe(den$z, xlab = "duration", ylab = "waiting", zlab = "density",
          col = "grey50", aspect = c(1.2, 0.4))

library(loon.data); data(SAheart)
cloud(sbp~ldl + tobacco, data = SAheart)

# change the colour, point symbol, remove perspective and the frame box
cloud(sbp~ldl + tobacco, data = SAheart,
      col = adjustcolor("grey10", 0.3),
      pch = 19, perspective = FALSE,
      par.settings = list(axis.line=list(col = NA)))


library(gridExtra)
cloud1 <- cloud(sbp~ldl + tobacco, data = SAheart,
           col = adjustcolor("grey10", 0.3),
           pch = 19, perspective = TRUE,
           par.settings = list(axis.line=list(col = NA)),
           # rotate (in degrees) about each axis
           screen = list(z= 20, x = -70, y = 0),
           xlab = "", ylab = "", zlab = "")
# rotate about y by 3 degrees
cloud2 <- cloud(sbp~ldl + tobacco, data = SAheart,
                col = adjustcolor("grey10", 0.3),
                pch = 19, perspective = TRUE,
                par.settings = list(axis.line=list(col = NA)),
                # rotate (in degrees) about each axis
                screen = list(z= 20, x = -70, y = 3),
                xlab = "", ylab = "", zlab = "")
# stereo display
grid.arrange(cloud1, cloud2, nrow =1)



# use the rgl package
library(rgl)
solidcol <- "steelblue"
persp3d(den$z, col = solidcol)
        # with a three button mouse:
        #left buttom  - rotate
        #middle button - change perspective
        # right button - size)
    # get rid of perspective, make things orthogonal
    # FOV(or field of view) is "the degee of parallax"
    # FOV = 0, is orthogonal (or isometric) projection.
par3d(FOV = 0)
persp3d(den$z, col = "cornsilk")



# interacitve three dimensional plots
# surfaces and planes
x <- den$x
y <- den$y        
z <- den$z
# put the coordinates in a unit cube
myscale <- function(x){
  xrange <- range(x)
  (x - min(xrange))/diff(xrange)
}
x <- myscale(x)
y <- myscale(y)
z <- myscale(z)

surface3d(x,y,z, color = solidcol)
# we could ass an arbitrary plane as well
# say the plane a*x + b*y + c*z + d = 0
planes3d(0,0,1, -median(z), col = "firebrick", alpha =0.5)
planes3d(0,0,1, -quantile(z, 0.75), col = "steelblue", alpha =0.5)

# a point in the centre of the box is
centre <- c(median(x), median(y), median(z) )
# a plane's orientation is given by its normal vector
a <- 1
b <- 1
c <- 1
normal <- c(a,b,c)
# get d to put this through the centre
d <- - t(normal) %*% centre
planes3d(a,b,c,d, col = "purple", alpha= 0.7)

marginal contribution

#######################################################
# marginal contribution - projections and slices
#######################################################       



# distance determines the amount of perspective ,default is 0.2
cloud <- cloud(obesity~alcohol + tobacco, data = SAheart,
                distance = 1, perspective = TRUE,
                par.settings = list(axis.line=list(col = NA)),
                # rotate (in degrees) about each axis
                screen = list(z= 0, x = 10, y = 0),
                xlab = "x", ylab = "y", zlab = "z")

# no perspective = marginal = orthogonal projection


# three dimensional scatterplot - interactive
library(loon)
cols <- rep("firebrick", length(SAheart$chd))
cols[SAheart$chd == "Yes"] <- "steelblue"
# need a new data frame
newdata <- data.frame(sbp = SAheart$sbp, ldl = SAheart$ldl,
                      tob = SAheart$tobacco)
# create the navigation graph
l_navgraph(data = newdata,
           # optional arguements follow
           color = cols, glyph = "ccircle", showGuides = TRUE)

# a free hand rotation in loon
with(l_scale3D(newdata),
     l_plot3D(x = sbp, y = ldl, z = tob, color = cols,
              showGuides = TRUE, glyph = "ccircle"))


# conditioninh on one or more variate values
# beginning with a density estimate for sbp
library(ggplot2)
ggplot(data= SAheart, mapping = aes(sbp)) +
         geom_density(fill = "steelblue", alpha =0.5)


# now condition on family history - by color
p <- ggplot(data= SAheart, mapping = aes(sbp)) +
  geom_density(mapping = aes(fill = famhist), alpha = 0.5)


# now condition on family history - by locations and color
p + facet_grid(famhist~.)
p + facet_grid(.~famhist)

# now condition on faimly history (colour and location) and value of chd (location)
p + facet_grid(chd ~ famhist, labeller = label_both)

# a two variates display conditional on the value of a third,
p <- ggplot(data = SAheart, mapping = aes(ldl, sbp))
p + geom_point(color = "steelblue")
# condition on famhist(colour)
p + geom_point(mapping = aes(color = famhist))
# conditional on famhist (colour+location)
p + geom_point(mapping = aes(color = famhist)) + facet_grid(.~famhist)
# conditional on famhist (colour + location) - densities
p + geom_density_2d() +
  facet_grid(.~famhist, labeller = label_both)

# conditional on a continuous variate, say tobacco - colour
p + geom_point(mapping = aes(color = tobacco), size = 2, alpha = 0.5)




# conditional on a continuous variate, say tobacco -size
p + geom_point(mapping = aes(size = tobacco), color = "steelblue", alpha = 0.5)


# cut_interval(): make n groups having equal range
# cut_number(): makes n groups having euqal numbers of observations
# cut_width(): makes however many groups of constant width

# conditioning on one or more variate values
p + geom_density2d(color = "steelblue") +
  facet_grid(.~cut_number(tobacco, n = 3))

p + geom_point(color = "steelblue",alpha =0.5) +
  facet_grid(.~cut_number(tobacco, n = 3))

# turn an ggplot into an interactive loon plot
# 这个要额外下载package， 网站https://cran.r-project.org/web/packages/loon.ggplot/index.html
gg_p <- p + geom_density2d(color = "steelblue") + geom_point() +
  facet_grid(.~cut_number(tobacco, n =3))
myplot <- loon.ggplot(gg_p, linkingGroup = "SAheart")
# marginalization = orthogonal projection


p2 <- l_plot(SAheart$tobacco, jitter(rep(1, length(SAheart$tobacco))),
             linkingGroup = "SAheart")
p3D <- l_plot3D(l_scale3D(SAheart[,c("obesity", "tobacco", "alcohol")]),
                linkingGroup = "SAheart")

noncartesian


#######################################################
# noncartesian 1 - beyong three dimensions
#######################################################       
data <- iris[,1:4] # the four measurements on Anderson's irises
scale01 <- function(x) {
  (x - min(x))/diff(range(x) + 1)
}

data <- apply(data, 2, scale01)
library(TeachingDemos)
faces2(data[c(1,51,101), ],
       which = c(8,13,14,17), # select features
       scale = "none", nrows = 1, ncols = 3)


# with the appropriate packages, we can explore how variables these features are:
library(tkrplot)
if(interactive()){
  tke2 <- rep(list(
                list('slider',
                     from = 0, to = 1, init = 0.5,
                     resolution = 0.1)),
              18)
  names(tke2) <- c('CenterWidth','TopBottomWidth','FaceHeight','TopWidth',
                   'BottomWidth','NoseLength','MouthHeight','MouthCurve',
                   'MouthWidth','EyesHeight','EyesBetween','EyeAngle',
                   'EyeShape','EyeSize','EyeballPos','EyebrowHeight',
                   'EyebrowAngle','EyebrowWidth')
  tkfun2 <- function(...){
    tmpmat <- rbind(Min = 0, Adjust = unlist(list(...)), Max = 1)
    faces2(tmpmat, scale = 'none')
  }
  tkexamp(tkfun2, list(tke2), plotloc = 'left',
          hscale = 2, vscale = 2)
    # Create Tk dialog boxes with controls to show examples of
  # changing parameters on a graph.
}

n <- nrow(data)
faces2(data[sample(1:n, n, replace = FALSE),],
            which = c(6,7,8,12),
            scale = "columns",
            nrow = 10, ncol = 15)

# a function that in R called stars(..), which will produce Nightingale's rose style glyphs
n <- nrow(data)
stars(data[sample(1:n, n, replace= FALSE), ],
      nrow =10, ncol = 15,
      main = "Nightingales Rose (almost)",
      draw.segments = TRUE, cex = 1.2)
# data explorations using stars(..) again
stars(data, locations =data[,1:2],
      lwd = 0.5, # narrow the line
      len = 0.05,
      labels = NULL, radius = TRUE,
      col.lines = rep(adjustcolor("steelblue", 0.5), n),
      col.stars = rep(adjustcolor("steelblue", 0.5), n),
      nrow = 10, ncol = 15,
      key.loc = c(12,-0.5),
      main = "iris data"
      )

Tableau

ctrl + shift + B -> 放大workplace
ctrl + B -> 缩小workplace

Join和blend

Join:在data source中点击要join的sheet,看到sheet的信息后注意点击中上方表的小长方块,然后拖动要join的另一个table,中间会出现两个重合的圆,来调节join的类型.

Blending是特别版本的join,join只能join同类型的data,但blending可以join不同版本的,比如excel join到sql dataset.
Join时会说left和right table,blending中叫做primary data source和secondary data source.

当Blending时有两个相同的key.
Blending will result certain level of data aggregation(sum, avg, etc…)
default: sum相加
如果是无法sum的char值,会返回星号*.
如果Blending时没有相同的key,会返回null.

Scattor plot

我们发现scattor plot需要三个值.两个负责建立坐标(columns + rows)还有一个是数值,导入marks里面.

比如column:transation + rows:amount + marks:customer_id
customer_id作为每个customer的信息导入.

关于出现的outlier:删掉或对column/row进行对数(log)转换.

对数转换:使用calculate field中的sum和log function.

按子类别计算更多的detail:

增加calculated field:
{ fixed[Sub category]: avg([Amount]) }
并增加在mark中的size上.
以及因为是value的计算,tableau会自动求和,但我们不需要求整个sub category的sum,所以得改成avg.

Clusters:在analytics的区域,给scattor plot分不同的群体.
把在marks中生成的cluster可以拖入会data里的string value区域,可以保留并用在其他的图中.

Bar chart

Bar chart需要一个分组作为rows(刚刚的cluster group),以及一个数值(amount)作为值(marks).然后点击右上方的show me中的bar chart.
How many ppl in a cluster group, add calculated field:
CountD([Customer ID])

我们可以做一个两头bar chart:
将distinct customer ID也作为column值,做一个左右并排的bar chart.再选中其中的一个右键选取edit axis并reversed.

在最上方的工具栏里有swap rows and columns很好用.

Dashboard

将chart(sheet)集合在一起的地方,还能加action,实时比较不同category下的分析数据,很牛逼.
例如:将某个category base的bar chart右键use as filter,那么点击某个category,其他的图标就会计算那个category下的数据.

正常流程:在dashboard tool bar中添加action,选择highlight就只会high light,不会把其他的数据隐藏.
source sheet:用于选取的category base的sheet.

在查看图表每个数值的时候可以修改tooltip整理每个数值的标签.
可以将其他想要加入的数据添加到marks的detail里,再在tooltip中的insert加入.

Marks上面的filters可以用category类的值来选择要研究的数据.

常用普通表格之colouring.
可以将measure value按住ctrl拖入color(复制一份使用colour),然后选择square,表格上就会按照数据的大小决定颜色深浅.
如果有多列的数据,在colour中有use separate legend可以分别辨识颜色深浅.

Underlake Reading Room

关于stat的笔记 - visualization

R