
Notes on stat - visualization

Notes for stat442 kindly contributed by a friend, plus a tiny bit of Tableau notes (abandoned).

Plus notes and a dataset kindly contributed by another friend.




R

Numbers Order

# library(aplpack)
# stem(marks, scale = 0.5, width = 100)
# the default separates the leaves in each group into 0-4 and 5-9; see page 7
# stem.leaf(marks)

# a quantile plot of the ordered values
worldLeaders <- c(62,65,65,65,66,67,68,72,73,73)
n <- length(worldLeaders)
p <- ppoints(n)
plot(x = p, y = worldLeaders, type = "o", lwd = 2,
col = "grey10", xlab = "proportions", ylab = "height in inches",
main = "heights of world leaders")
# n <- length(marks)
# p <- ppoints(n)
# plot(x = p, y = sort(marks), type = "o", lwd = 2, col = "grey10",
# xlab = "proportion p", xlim = c(0,1), ylim = c(0, 100),
# main = "empirical qunatile function for the final grades")

# quantile plot for marks data
data("sittingHeights", package = "qqtest")
with(sittingHeights,
plot(percentCumulative, upperBound, type = "o", lwd = 2,
xlim = c(0,100), ylim = c(27, 38),
xlab = "percentage of women",
ylab = "sitting heights in inches"))



# air quality measurements
qvals <- sort(airquality$Solar.R)
n <- length(qvals)
pvals <- ppoints(n)

plot(pvals, qvals,
main = "solar radiation in central park",
xlab = "cumulative proportion",
ylab = "radiation (Langleys)")
# comment: the values are very dense at the bottom of the range, but not at the top

# old faithful geyser
data("geyser", package = "MASS")
qvals <- sort(geyser$duration)
n <- length(qvals)
pvals <- ppoints(n)
plot(pvals, qvals, cex = 1.5,
main = "quantile plot of old faithful eruptions",
xlab = "cumulative proportion",
ylab = "duration (minutes)")

Histograms

drawbox <- function(x,y,...){
xvals <- c(x, rev(x), x[1])
yvals <- c(rep(y, each = 2), y[1])
lines(xvals, yvals, ...)
}
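
# a small usage sketch: draw the unit square on an empty canvas
plot(0:1, 0:1, type = "n", xlab = "", ylab = "")
drawbox(c(0, 1), c(0, 1), col = "grey50")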

# the empirical cumulative distribution
plot(x = sort(marks), y = ppoints(length(marks)), type ="o",
col = "grey10", xlab = "final grade",
ylab = "proportion", main = "empirical distribution function")

# in r, the function ecdf() produces the empirical distribution function from
# any collection of numbers
Fhat <- ecdf(marks)
plot(Fhat,type ="o",
col = "grey10", xlab = "final grade",
ylab = "proportion", main = "empirical distribution function")


# histogram
data("geyser", package = "MASS")
plot_colour <- "grey50"
with(geyser,
hist(duration, col = plot_colour, xlim = extendrange(duration),
main = "old faithful",xlab ="duration"))

# change the break points
with(geyser,
hist(duration, col = plot_colour, xlim = extendrange(duration),
breaks = c(0,0.5,1.0,2.75,3,3.5,4,4.5,5,6.0),
main = "old faithful",xlab ="duration"))

# change the starting point in histogram
savepar <- par(mfrow = c(2,3))
with(geyser,
for (start in seq(0,0.4,0.1))
{
hist(duration,
breaks = seq(start, 6.0,0.5),
probability = TRUE,
xlim = c(0, 6),
ylim = c(0, 0.7),
main = "",
xlab = "duration")
})

par(savepar)


# interactive histograms with loon package
library(loon)
data("geyser", package = "MASS")
with(geyser,
{l_hist(duration,
yshows = "density",
showBinHandle = TRUE,
xlab = "duration of eruption (minutes)",
title = "old faithful geyser",
showScales = TRUE,
showGuides = TRUE,
linkingGroup = "old faithful"
)})

# show uncertainty due to bin locations
# blurriness suggests uncertainty
data("geyser", package = "MASS")
Nstarts <- 5
start_values <- seq(0,0.4, length.out = Nstarts)
plot_colour_transparent <- adjustcolor(plot_colour, alpha.f = 1/Nstarts)
with(geyser,
for (start in start_values)
{if (start == 0)
{hist(duration,
breaks = seq(start, 6.0, 0.5),
col = plot_colour_transparent,
border = adjustcolor(plot_colour_transparent, 0.0),
probability = TRUE,
xlim = c(0,7), ylim = c(0,0.7),
main = "",xlab = "duration")

} else { #add the histogram to the existing plot
hist(duration,
breaks = seq(start, 6.0, 0.5),
col = plot_colour_transparent,
border = adjustcolor(plot_colour_transparent, 0.0),
probability = TRUE,
add = TRUE)
}})

# averaging over histograms as the bins change
hist <- hist(geyser$duration,breaks=seq(0.0,6.0,0.5),
plot = FALSE)
# str(hist) #gives the summary of the structure

# the function averages the shifted histograms
ash <- function(x, n_hists = 5, binwidth = NULL,
type = "l", xlab = "x", main = "average shifted histogram",
ylab = "averaged", lwd = 2, lty = 1, xlim, ylim,
col = "grey50", fill, ...){
xrange <- round(extendrange(x), 1)
if (is.null(binwidth)) binwidth <- diff(xrange) / 10
delta <- binwidth/n_hists
xmin <- min(xrange) - binwidth
xmax <- max(xrange) + binwidth
breaks <- seq(xmin, xmax -binwidth, binwidth)
n_breaks <- length(breaks)

## x values where density will be calculated and averaged
xvals <- seq(xmin, xmax-delta, delta)
n_xvals <- length(xvals)

# total density will be numeric, this is where we store a total density for
# each xval
density <- numeric(n_xvals)

# the loop over the histograms to get densities, add them up
# the indices of x have n_hists fewer values because there are
# n_hists shifts in the summing
x_indices <- seq(1,n_xvals-n_hists)
for ( i in seq(0, n_hists - 1)) {
cur_hist <- hist(x, breaks=breaks+delta*i, plot = FALSE)
density[i+x_indices] <-
# current values
density[i + x_indices] +
## add the density from the current histogram
## rep: repeats each value of its first argument (here n_hists times each)
rep(cur_hist$density, rep(n_hists, n_breaks - 1))
}
density <- density/n_hists
# plot the averages, and check for missing arguments
if (missing(xlim)) xlim <- xrange
if (missing(ylim)) ylim <- extendrange(density)
if (missing(fill)) fill <- col
plot(rep(xvals, each = 2) + rep(c(-delta/2, delta/2), length(xvals)),
rep(density, each = 2), type = type,
xlim = xlim, ylim = ylim, xlab = xlab, ylab = ylab, main = main,
lwd = lwd, lty = lty, col = col,
bty = "n", ...)
# and fill it in
polygon(rep(xvals, each = 2) + rep(c(-delta/2, delta/2), length(xvals)),
rep(density, each = 2), col =fill)
}


savePar <- par(mfrow = c(2,3))
with(geyser,
{for(start in seq(0,0.4,0.1))
{hist(duration,
breaks=seq(start,6.0,0.5),
col = plot_colour,
probability = TRUE,
xlim = c(0,6),
ylim = c(0, 0.7),
main = "",
xlab = "duration"
)
}
ash(duration, main="",xlab = "duration", xlim = c(0,6),
ylim=c(0,0.7))
}
)
par(savePar)


# weight function, this returns the weights alone or the weights and information
# on how to draw them as a function of x
ashweights <- function(m,h,k=0){
# k is the subscript of the central bin containing x, B_k,
# m is the number of histograms
# h is the bin width in each of the m histograms being averaged
indices <- seq(1-m, m-1, 1)
wts <- 1-abs(indices)/m
if (!missing(h)){
width <- h/m
hts <- rep(c(0, wts, 0), each = 2)
bins <- c(-m, rep(c(indices, m), each = 2), m+1) * width + k
list(wts = wts, bins = bins, hts = hts)
} else {
list(wts = wts)
}
}

# when m = 4, the weights are
ashweights(m = 4)$wts
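# which should be 0.25 0.50 0.75 1.00 0.75 0.50 0.25
# (a triangular shape, largest at the central bin)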

# draw the weighted function
h <- 0.5
m <- 4
ashInfo <- ashweights(m = m, h = h, k = 0)
w_hts <- ashInfo$hts
B_bins <- ashInfo$bins

plot(B_bins, w_hts, type = "l",
main = expression("ASH weight function for" ~ x %in% B[0]),
ylab = "weights", xlab = "x",
col = "firebrick")
lines(c(0,0), c(0, max(w_hts)), lty = 2)
lines(rep(h/m,2), c(0, max(w_hts)), lty = 2)
text(0.5*(h/m), 0.1, labels = expression(B[0]))

# the weight is highest for the count of x in B_0, and decreases symmetrically
# for m-1 small bins on either side

visual comparisons


library(knitr)
classTable <- apply(Titanic,MARGIN = c(4,1), FUN = sum)
kable(classTable)

# number in each class is
classTotals <- apply(classTable, MARGIN = 2, FUN =sum)
classSurvival <- t(classTable["Yes", ]/classTotals)
rownames(classSurvival) <- c("survived")
kable(classSurvival)


# following the rules for tables, a better way to present these numbers is
newTable <- 100 * round(classSurvival, 2)
newTable <- t(newTable)
# descending order
descendingOrder <- order(newTable, decreasing = TRUE)
newTable <- newTable[descendingOrder, ,drop = FALSE] # note drop argument
colnames(newTable) <- c("% survived")
kable(newTable, caption = "survival rates on the titanic by class")

# as lengths of bars, colour coded and labelled by class
nvals <- nrow(newTable)
col <- rainbow(nvals, alpha = 0.5)
barplot(newTable, col = col, horiz = TRUE, axes = FALSE,
names.arg = c(""), xlab = colnames(newTable))
xlocs <- cumsum(newTable)
centres <- c(xlocs[1]/2, xlocs[1:(nvals - 1)] + diff(xlocs)/2)
text(centres, 0.75, labels = rownames(newTable))

barplot(newTable, col = col, horiz = TRUE, beside = TRUE,
names.arg = c(""),
xlab = colnames(newTable),
legend.text = rownames(newTable))


# the titanic - survivors beside deaths
survivalProportions <- classTable
survivalProportions["Yes",] <- survivalProportions["Yes", ]/classTotals
survivalProportions["No",] <- survivalProportions["No", ]/classTotals
survivalCols <- adjustcolor(c("black", "grey"), 0.5)
barplot(survivalProportions, col = survivalCols,
horiz = TRUE, beside = TRUE,
xlab = "proportion of class", xlim = c(0,1))
legend("bottomright", title = "survival", fill = survivalCols,
legend = rownames(survivalProportions))

# survivor and deaths within a common frame
# there is a framing effect which makes proportions very easy to compare
# from one end or the other
# all are along common aligned scales

barplot(survivalProportions, col = survivalCols,
horiz = TRUE, beside = FALSE,
xlab = "proportion of class", space = 0)

barplot(apply(classTable, MARGIN = 2, FUN =sum),
col = adjustcolor("steelblue", 0.5),
xlab = "class", ylab = "number of passengers")
barplot(classTable, col = survivalCols, xlab = "class", ylab = "number of passengers")




library(loon.data)
data("SAheart", package = "loon.data")
kable(head(SAheart))

noFamilyHistory <- SAheart[,"famhist"] == "Absent"
FamilyHistory <- SAheart[,"famhist"] == "Present"
famHIsotryCol <- adjustcolor("steelblue", 0.5)
noHistoryCol <- adjustcolor("firebrick", 0.5)
boxplot(SAheart[noFamilyHistory, "sbp"], col = noHistoryCol,
main = "No family history", horizontal = TRUE)


# use formula notation y~x and a single data set, on common aligned scales
boxplot(sbp~famhist, data = SAheart, col = c(noHistoryCol, famHIsotryCol),
main = "systolic blood pressure", horizontal = TRUE)

library(ggplot2)
ggplot(data=SAheart, mapping = aes(x=famhist, y = sbp)) +
geom_boxplot(colour = c("firebrick", "steelblue"),
fill = c("firebrick", "steelblue"),
alpha = 0.5) +
coord_flip()


hist(SAheart[noFamilyHistory, "sbp"], col = noHistoryCol,
main = "No family history",
xlab = "systolic blood pressure",
xlim = extendrange(SAheart[,"sbp"]), ylim = c(0, 60))
# using par(mfrow = c(2,1)) alone does not put the plots on a common scale;
# put them on a common scale as follows
hist(SAheart[noFamilyHistory, "sbp"], col = noHistoryCol,
main = "No family history",
xlim = extendrange(SAheart[,"sbp"]))
hist(SAheart[FamilyHistory, "sbp"], col = famHIsotryCol,
xlim = extendrange(SAheart[,"sbp"]), add = TRUE)



# reflected histograms
xrange <- extendrange(SAheart[,"sbp"])
breaks <- seq(xrange[1], xrange[2], length.out = 12)
h1 <- hist(SAheart[noFamilyHistory,"sbp"],
breaks = breaks, plot = FALSE)
h2 <- hist(SAheart[FamilyHistory,"sbp"],
breaks = breaks, plot = FALSE)
hmax = max(c(h1$counts, h2$counts))
h2$counts <- -h2$counts
hmin = -hmax
X = c(h1$breaks, h2$breaks)
xmax <- max(X)
xmin = min(X)
plot(h1, xlab = "systolic blood pressure", main = "comparing patients with (blue) and without (pink)",
ylim = c(hmin, hmax),xlim = c(xmin, xmax),
col = noHistoryCol)
lines(h2, col = famHIsotryCol)

# back to back histograms
yrange <- extendrange(SAheart[,"sbp"])
breaks <- seq(yrange[1], yrange[2], length.out = 12)
h1 <- hist(SAheart[noFamilyHistory,"sbp"],
breaks = breaks, plot = FALSE)
h2 <- hist(SAheart[FamilyHistory,"sbp"],
breaks = breaks, plot = FALSE)
nbreaks <- length(breaks)
hmax = max(c(h1$counts, h2$counts))
h2$counts <- -h2$counts
hmin = -hmax
Y <- rep(h1$breaks, each = 2)
X <- c(0,rep(h1$counts, each = 2), 0)
plot(rep(0,2), range(Y), type = "l", col = "black",
xlim = c(hmin, hmax), ylim = extendrange(Y),
bty = "n",
xlab = "frequency", ylab = "systolic blood pressure",
main = "comparing patients with and without")
polygon(X, Y, col = noHistoryCol)
for (i in 1:(nbreaks - 1)) {lines(c(0, h1$counts[i]), rep(h1$breaks[i+1], 2))}
Y <- rep(h2$breaks, each = 2)
X <- c(0,rep(h2$counts, each = 2), 0)
polygon(X, Y, col = famHIsotryCol)
for (i in 1:(nbreaks - 1)) {lines(c(0, h2$counts[i]), rep(h2$breaks[i+1], 2))}



# using facet with ggplot2
library(ggplot2)
ggplot(data =SAheart, mapping = aes(x =sbp)) +
geom_histogram(bins =12,
colour = "grey50",
fill="white") +
facet_grid(famhist~.)


# what can be compared via density estimates
savepAr <- par(mfrow=c(1,2))
densAbsent <- density(SAheart[noFamilyHistory,"sbp"], bw = "SJ")
densPresent <- density(SAheart[FamilyHistory,"sbp"], bw = "SJ")

plot(densAbsent, col = "firebrick", main = "no family history")
polygon(densAbsent, col = noHistoryCol)
plot(densPresent, col = "steelblue", main = "family history")
polygon(densPresent, col = famHIsotryCol)
par(savepAr)
# note the scales are not necessarily identical

# compare both x,y aligned x only scales
savep <- par(mfrow=c(2,1))
xlim <- extendrange(SAheart[,"sbp"])
ylim <- extendrange(c(densAbsent$y, densPresent$y))
plot(densAbsent, col = "firebrick", main = "no family history",
xlim = xlim, ylim = ylim)
polygon(densAbsent, col = noHistoryCol)
plot(densPresent, col = "steelblue",main = "family history",
xlim = xlim, ylim = ylim)
polygon(densPresent, col = famHIsotryCol)
par(savep)


# common aligned scales
plot(densAbsent, col = "firebrick", xlab = "systolic blood pressure",
main = "comparing a no family history with family history",
xlim = xlim, ylim = ylim)
polygon(densAbsent, col = noHistoryCol)
lines(densPresent, col = "steelblue")
polygon(densPresent, col = famHIsotryCol)





# common aligned scales-reflected
densPresent$y <- - densPresent$y
ylim2 <- extendrange(c(densAbsent$y, densPresent$y))
plot(densAbsent, col = "firebrick", xlab = "systolic blood pressure",
main = "comparing a no family history with family history",
xlim = xlim, ylim = ylim2)
polygon(densAbsent, col = noHistoryCol)
lines(densPresent, col = "steelblue")
polygon(densPresent, col = famHIsotryCol)



# back to back
ylim3 <- extendrange(SAheart[,"sbp"])
xlim2 <- extendrange(c(densAbsent$y, densPresent$y))
xyswitch <- function(xy_thing){
yx_thing <- xy_thing
yx_thing$x <- xy_thing$y
yx_thing$y <- xy_thing$x
yx_thing
}
plot(xyswitch(densAbsent), col = "firebrick", xlab = "Density",
ylab = "systolic blood pressure",
main = "comparing a no family history with family history",
xlim = xlim2, ylim = ylim3)
polygon(xyswitch(densAbsent), col = noHistoryCol)
lines(xyswitch(densPresent), col = "steelblue")
polygon(xyswitch(densPresent), col = famHIsotryCol)


# using facet with ggplot2
library(ggplot2)
ggplot(data=SAheart, mapping = aes(x=sbp, col = famhist)) +
geom_density(colour = "grey50",
fill = "black",
alpha = 0.4,
bw = "SJ") +
facet_grid(famhist~.)



# simple points
xlim <- extendrange(SAheart[,"sbp"])
n <- nrow(SAheart)
col = rep(adjustcolor("firebrick", 0.2),n)
col[FamilyHistory] <- adjustcolor("steelblue", 0.2)
Y <- rep(1,n)
Y[FamilyHistory] <- -1
plot(SAheart[,"sbp"], y = Y, col = col, pch = 19, cex = 3,
xlab = "systolic blood pressure",
main = "comparing a no family history with family history",
xlim = xlim, ylim = c(-2,2), bty = "n", yaxt = "n")

# simple points with jittering

xlim <- extendrange(SAheart[,"sbp"])
n <- nrow(SAheart)
col = rep(adjustcolor("firebrick", 0.2),n)
col[FamilyHistory] <- adjustcolor("steelblue", 0.2)
Y <- rep(1,n)
Y[FamilyHistory] <- -1
U <- runif(n, -0.3,0.3)
plot(SAheart[,"sbp"], y = Y + U, col = col, pch = 19, cex = 3,
xlab = "systolic blood pressure",
main = "comparing a no family history with family history",
xlim = xlim, ylim = c(-2,2), bty = "n", yaxt = "n")

# quantile plots
savePar <- par(mfrow = c(1,2))
nAbsent <- sum(noFamilyHistory)
nPresent <- sum(FamilyHistory)
pAbsent <- ppoints(nAbsent)
pPresent <- ppoints(nPresent)

plot(pAbsent,sort(SAheart[noFamilyHistory, "sbp"]), type = "b",
col = noHistoryCol, pch =19,
xlab = "cumulative proportion",
ylab = "systolic blood pressure",
main = "no family histroy")

plot(pPresent,sort(SAheart[FamilyHistory, "sbp"]), type = "b",
col = famHIsotryCol, pch =19,
xlab = "cumulative proportion",
ylab = "systolic blood pressure",
main = "family histroy")
par(savePar)

# quantile plot common aligned scales via overlaying
ylim <- extendrange(SAheart[,"sbp"])
nAbsent <- sum(noFamilyHistory)
nPresent <- sum(FamilyHistory)
pAbsent <- ppoints(nAbsent)
pPresent <- ppoints(nPresent)

plot(pAbsent,sort(SAheart[noFamilyHistory, "sbp"]), type = "b",
col = noHistoryCol, pch =19,
xlab = "cumulative proportion",
ylab = "systolic blood pressure",
main = "no family histroy")
points(pPresent, sort(SAheart[FamilyHistory, "sbp"]),
type = "b", col = famHIsotryCol, pch =19)

visual hypothesis testing


# a function which would achieve this mixing is
mixRandomly <- function(data){
# note the data need not be a data frame;
# it is expected to be a list with an x and y component (possibly different lengths)
x <- data$x
y <- data$y
m <- length(x)
n <- length(y)
mix <- c(x,y)
select4x <- sample(1:(m+n),
m,
replace = FALSE)
new_x <- mix[select4x]
new_y <- mix[-select4x]
list(x = new_x, y = new_y)
}
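
# a sanity check on a tiny example (any seed works): the component sizes are
# preserved while the pooled values are reshuffled between x and y
set.seed(442)
str(mixRandomly(list(x = 1:3, y = 4:8)))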


myplot <- function(data) {
xrange <- extendrange(c(data$x, data$y))
breaks <- seq(xrange[1], xrange[2], length.out = 12)
hx <- hist(data$x, breaks = breaks, plot = FALSE)
hy <- hist(data$y, breaks = breaks, plot = FALSE)
yrange <- extendrange(c(hx$counts, hy$counts))

plot(hx, main = "", axes = FALSE,
ylab = "", xlab = "", xaxt = "n", yaxt = "n",
ylim = yrange, xlim = xrange,
col = adjustcolor("firebrick", 0.4))
lines(hy, col = adjustcolor("steelblue", 0.4))
}

# apply it
library(loon.data)
data("SAheart", package = "loon.data")
Present <- SAheart[, "famhist"] == "Present"
x <- SAheart[Present, "sbp"]
y <- SAheart[!Present, "sbp"]
data = list(x= x, y=y)

savePar <- par(mfrow = c(1,2))
heads <- runif(1) > 0.5 # toss a coin for order
tails = !heads
if (heads) myplot(data) else myplot(mixRandomly(data))
if (tails) myplot(mixRandomly(data)) else myplot(data)

par(savePar)


# function to do a line up test

lineup <- function(data, showSubject = NULL, generateSubject = NULL,
trueLoc = NULL, layout = c(5,4)){
nSubjects <- layout[1] *layout[2]
if (is.null(trueLoc)) {trueLoc <- sample(1:nSubjects, 1)}
if (is.null(showSubject)) {stop("need a plot function for the subject")}
if (is.null(generateSubject)) { stop("need a function to generate subject")}

# need to decide which subject to present
presentSubject <- function(subjectNo){
if (subjectNo != trueLoc) {data <- generateSubject(data)}
showSubject(data, subjectNo)}

# this does the plotting
savePar <- par(mfrow= layout,
mar = c(2.5,0.1,0.3,0.1), oma = rep(0,4))
sapply(1:nSubjects, FUN = presentSubject)
par(savePar)

# hide the true location but return information to reconstruct it
hideLocation(trueLoc, nSubjects)
}

hideLocation <- function(trueLoc, nSubjects){
possibleBaseVals <- 3:min(2*nSubjects, 50)
# remove easy base values
possibleBaseVals <- possibleBaseVals[possibleBaseVals != 10 &
possibleBaseVals != 5]
base <- sample(possibleBaseVals, 1)
offset <- sample(5:min(5*nSubjects, 125), 1)

# return the location information (trueLoc hidden)
list(trueLoc = paste0("log(",
base^(trueLoc +offset),
", base = ", base,
") - ", offset))
}

revealLocation <- function(hideLocation){eval(parse(text=hideLocation$trueLoc))}

# suspectLocation <- hideLocation(3,20)
# revealLocation(suspectLocation)
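
# the arithmetic behind the hiding: log(base^(trueLoc + offset), base) equals
# trueLoc + offset, so subtracting offset recovers trueLoc; e.g.
log(7^(3 + 11), base = 7) - 11  # 3, up to floating point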


# have to fix myPlot to accept a subject number
myPlot <- function(data, subjectNo) {
xrange <- extendrange(c(data$x, data$y))
breaks <- seq(xrange[1], xrange[2], length.out = 12)
hx <- hist(data$x, breaks = breaks, plot = FALSE)
hy <- hist(data$y, breaks = breaks, plot = FALSE)
yrange <- extendrange(c(hx$counts, hy$counts))

plot(hx, main = paste( subjectNo), cex.main = 1.5, line = -0.75,
ylab = "", xlab = "", xaxt = "n", yaxt = "n",
ylim = yrange, xlim = xrange,
col = adjustcolor("firebrick", 0.4))
lines(hy, col = adjustcolor("steelblue", 0.4))
}

# we need only execute the test as follows
data <- list(x = x, y=y)
lineup(data,
generateSubject = mixRandomly,
showSubject = myPlot,
layout = c(4,5))



# back to back displays
back2back <- function(data, subjectNo) {
ylim <- extendrange(c(data$x, data$y))
Xdensity <- density(data$x, bw = "SJ")
Ydensity <- density(data$y, bw = "SJ")
Ydensity$y <- - Ydensity$y
xlim <- extendrange(c(Xdensity$y, Ydensity$y))
xyswitch <- function(xy_thing){
yx_thing <- xy_thing
yx_thing$x <- xy_thing$y
yx_thing$y <- xy_thing$x
yx_thing
}
plot(xyswitch(Xdensity), col = "firebrick", xlab = "",
ylab = "",
main = paste(subjectNo),cex.main = 1.5, adj = 0.75,line = -0.85,
xaxt = "n", yaxt = "n",
xlim = xlim, ylim = ylim)
polygon(xyswitch(Xdensity), col = adjustcolor("firebrick", 0.4))
lines(xyswitch(Ydensity), col = "steelblue")
polygon(xyswitch(Ydensity), col = adjustcolor("steelblue", 0.4))

}

data <- list(x = x, y=y)
lineup(data,
generateSubject = mixRandomly,
showSubject = back2back,
layout = c(4,5))

# quantiles display
quantiles <- function(data, subjectNo){
ylim <- extendrange(c(data$x, data$y))
n_x <- length(data$x)
n_y <- length(data$y)
p_x <- ppoints(n_x)
p_y <- ppoints(n_y)

plot(p_x, sort(data$x), type = "b",
pch = 19, col = adjustcolor("firebrick", 0.4),
cex = 2, ylim = ylim,
xlab = "", ylab = "", main = paste(subjectNo),
cex.main = 1.5, adj = 0.75,line = -0.85,
xaxt = "n", yaxt = "n")
points(p_y, sort(data$y), type = "b",
col = adjustcolor("steelblue", 0.4),pch = 19,
cex = 2)
}

lineup(data,
generateSubject = mixRandomly,
showSubject = quantiles,
layout = c(4,5))

# we might use boxplot if we are only interested in comparing locations and
# possible scales and symmetry

boxplots <- function(data, subjectNo){
y <- c(data$x, data$y)
x <- c(rep(0, length(data$x)),
rep(1, length(data$y)))
boxplot(y~x, data = cbind(x,y),
col = adjustcolor(c("firebrick", "steelblue"), 0.4),
main = paste(subjectNo),
cex.main = 1.5, adj = 0.75, line = -0.75,
xlab = "", ylab = "", xaxt = "n", yaxt = "n")
}

hypothesized generative distributions

# this function will return x and y data that have the same median(0) and
# interquartile range(1): same slope in the middle

matchData <- function(data){
x <- data$x
y <- data$y
list (x = scale(x, center = median(x), scale=IQR(x)),
y = scale(y, center = median(y), scale=IQR(y)))
}
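
# a quick check that the matching works: both components end up with median
# exactly 0 and IQR exactly 1 (a sketch with arbitrary inputs)
md <- matchData(list(x = rexp(50), y = rnorm(60)))
c(median(md$x), IQR(md$x), median(md$y), IQR(md$y))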

lineup(data,
showSubject = back2back,
layout = c(4,5),
generateSubject =
function(data) {
data <- matchData(data)
data <- mixRandomly(data)
matchData(data)
})




# compare to a specific shape
# rdist(): will generate n pseudo-random values from the hypothesized generative
# distribution (for some location and scale), then data matched on location (median)
# and scale (IQR) can be generated using:
getMatchData <- function(x,
rdist = rnorm,
matchScale = TRUE,
matchLocation = TRUE){
new_x <- rdist(length(x)) # generate new data
# now match the new data to have the same location and scale as the old data
if (matchScale) {new_x <- new_x * (IQR(x)/IQR(new_x))}
if (matchLocation) {new_x <- new_x - median(new_x) + median(x)}
new_x
}

# first, get the actual data
x <- beaver1[,"temp"]

# construct a pairing of it with a randomly generated sample from any Gaussian distribution
# and match the location and scale
data <- list(x = x, y = getMatchData(x))

# then this data can now be used in a line up test,
lineup(data,
showSubject = back2back,
layout = c(4,5),
generateSubject =
function(data) {
list(x = getMatchData(data$x),
y = getMatchData(data$y))
}
)

# if from other distribution, like t3
rdist <- function(n) rt(n, df = 3)
data <- list(x = x, y = getMatchData(x, rdist = rdist))

## the above method was compared to several other pairs of x and y where both
# were generated from the hypothesized distribution
# the extra variation introduced by random generation of both x and y under the
# null hypothesis might be making the determination more difficult
# the variation might be reduced if we instead use a single representative set of values
# for the comparison in each graphic.


# sample quantiles vs. theoretical quantiles

library("MASS")
attach(geyser)
plot_colour <- adjustcolor("black", 0.3)
qqnorm(duration, pch = 19, col = plot_colour)
# here geyser duration data with alpha blending to help reveal point density

savePar = par(mfrow=c(1,2))
qqnorm(SAheart[noFamilyHistory, "sbp"], pch = 19,
col = adjustcolor("firebrick", 0.4),
main = "no family history")
qqnorm(SAheart[FamilyHistory, "sbp"], pch = 19,
col = adjustcolor("steelblue", 0.4),
main = "family history")
par(savePar)



# self calibrating qqplots
library(qqtest)
qqtest(duration)
qqtest(duration, xAxisAsProbs = TRUE,yAxisAsProbs = TRUE)


# ordering comparisons
library(loon.data)
data("SAheart")

# getting the comparisons of interest
# get the groups
groups <- with(SAheart, split(sbp, list(famhist, chd)))
kable(t(names(groups)))

# now ordering the groups
ord <- c("Present.No", "Present.Yes", "Absent.Yes",
"Absent.No", "Present.No")
# match the colours to the family history
cols <- adjustcolor(c("steelblue", "steelblue", "firebrick",
"firebrick", "steelblue"), 0.5)
boxplot(groups[ord], col = cols)


# layout via graph structure
library("PairViz")
library(loon.data)
data(cancer)
str(cancer)

# we can separate the survival times by which organ is affected
organs <- with(cancer, split(Survival, Organ))
# and record their names for use later
organNames <- names(organs)
# the structure of the organs data
str(organs)

# getting Eulerians and Hamiltonians
ord <- eulerian(5)

library(colorspace)
# getting a rainbow of colours but choose chromaticity of 50 to dull them
cols <- rainbow_hcl(5, c = 50)
boxplot(organs[ord], col = cols[ord],
ylab = "Survival time", main = "cancer treated by vitamin C")

# layout via graph structure
# taking square roots should make the data look a little more symmetric
sqrtOrgans <- with(cancer, split(sqrt(Survival), Organ))

boxplot(sqrtOrgans[ord], col = cols[ord],
ylab = expression(sqrt("Survival time")),
main = "cancer treated by vitamin C")

# hamiltonian decompositions
ordHam <- hpaths(5) # here each hamiltonian is a row of the matrix
# for our purpose, we would like the hamiltonians to be joined together
ordHam <- hpaths(5, matrix = FALSE)



# run a statistical test comparing each pair and use the p-value
# to order, e.g. test the equality of each pair of means (assuming normality)
# pairwise.t.test() does this and corrects the p-values for simultaneity
test <- with(cancer,
pairwise.t.test(sqrt(Survival), Organ))
pvals <- test$p.value
round(pvals, 4)

# the pairviz package can handle graphs with weights
ngrps <- length(organNames)
# build matrix of weights
weights <- matrix(0, nrow = ngrps, ncol= ngrps)
rownames(weights) <- organNames
colnames(weights) <- organNames
weights[2:ngrps, 1:(ngrps-1)] <- pvals
round(weights, 4)
# right-sized matrix, but it needs to be made symmetric by filling the upper
# triangle to match the lower
diag(weights) <- 0
for (i in 1:(ngrps - 1)) {
for (j in (i+1):ngrps) {
weights[i,j] <- weights[j,i]
}
}
round(weights, 4)

# greedy eulerians
# low to high
low2highEulord <- eulerian(weights);
colnames(weights)[low2highEulord]
# high to low
high2lowEulord <- eulerian(- weights)
colnames(weights)[high2lowEulord]

# weighted hamiltonian decompositions
low2highHamord <- weighted_hpaths(weights)
# high to low
high2lowHamord <- weighted_hpaths(- weights)


# having a weighted graph g
library(Rgraphviz)
g <- mk_complete_graph(weights)
# par(mar = c(bottom, left, top, right))
par(mar = c(1, 1, 1, 1))
plot(g, "circo")



#######################################################
########### the code used to install packages via BiocManager
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")

BiocManager::install("Rgraphviz")
#######################################################

# remove all edges in the top half of the edge weights
wvals <- weights[upper.tri(weights)]
# the median
q <- quantile(wvals, 0.5)
# trim the graph via the pairviz function dn_graph()
g1 <- dn_graph(g,q,`<=`)
par(mar = c(1, 1, 1, 1))
plot(g1, "circo")

# this subgraph g1 is not even (some nodes have an odd number of edges)
# so it must be made even by adding extra edges to get an Eulerian
g1EulOrd <- eulerian(g1)

# back to the SAheart data
library(PairViz);library(loon.data);data("SAheart")
# building the graph
nodes <- names(groups)
SAheartGraph <- new("graphNEL", nodes = nodes, edgemode = "undirected")
SAheartGraph <- addEdge("Present.Yes", "Absent.No", SAheartGraph)
SAheartGraph <- addEdge("Present.Yes", "Absent.Yes", SAheartGraph)
SAheartGraph <- addEdge("Absent.Yes", "Absent.No", SAheartGraph)
SAheartGraph <- addEdge("Absent.No", "Present.No", SAheartGraph)

ord <-eulerian(SAheartGraph)
cols <- adjustcolor(rep(c("firebrick", "steelblue"), 2), 0.5)
names(cols) <- nodes
savePar <- par(mfrow=c(1,2), oma = rep(6,4))
boxplot(groups[ord], col = cols[ord], main = "family history and chd")
plot(SAheartGraph, "circo")
par(savePar)

time order

# time series data
library(loon)
p <- l_plot(sunspot.month, color = "grey", size = 1,
showScales = TRUE)
# connect the dots
l_layer_line(p, x = time(sunspot.month), y = sunspot.month,
color = "grey", linewidth = 2,
index = "end")

l_layer_hide(p, "model")

# show a window of a couple of years
window(sunspot.month, start=c(1960, 1), end = c(1961, 12))
end(sunspot.month)
start(sunspot.month)


# cycle: group the data by season or quarterly phase by using the cycle() function
window(cycle(sunspot.month), start = c(1960, 1), end = c(1960, 12))
quarter <- 1 + ((cycle(sunspot.month) - 1) %/% 3)
monthplot(sunspot.month, col = "grey50", col.base = "red",
lwd.base = 3, xlab = "quarter", phase = quarter,
labels = c("first", "second", "third", "fourth"))
# we might explore how well the value of one month might be predicted by the immediately
# prior month, that is how well might any value be predicted from knowing
# only its immediate predecessor
plot(x = sunspot.month[-length(sunspot.month)], y = sunspot.month[-1],
col = adjustcolor("firebrick", 0.15), pch = 19,
main = "sunspot activity lagged by one month")

# lag plots: can lag the data by any number of months,
lag.plot(sunspot.month,
col = adjustcolor("firebrick", 0.15), pch = 19,
main = "sunspot activity lagged by up to 12 months",
lags = 12)

freq <- frequency(sunspot.month)
lag.plot(sunspot.month,
col = adjustcolor("firebrick", 0.15), pch = 19,
main = "sunspot activity lagged by up to 15 months",
set.lags = freq*(1:15))

# jumped by 4 months
lag.plot(sqrt(sunspot.month),
col = adjustcolor("firebrick", 0.15), pch = 19,
main = "sunspot activity^(1/2) lagged 84 tp 180 by 4",
set.lags = seq(84,180, 4))

# identify every 11 years of the series as a cycle.
# determine the starting year of each 11 year cycle
timeseries <- sqrt(sunspot.month)
startYears <- seq(start(timeseries)[1], end(timeseries)[1], 11)
# number of cycles
ncycles <- length(startYears)
cycleNums <- rep(1:ncycles, each = 132)
# only need as many as we have data points
cycleNums <- cycleNums[1:length(timeseries)]


# each month will have a position number within its cycle
cyclePositions <- rep(1:132, ncycles)
cyclePositions <- cyclePositions[1:length(timeseries)]
head(cyclePositions)

library(RColorBrewer)
cols <- adjustcolor(rep(brewer.pal(9, name = "Set1"),
length.out = ncycles), alpha.f = 0.5)
opt <- par(bg="grey95")
plot(x = time(timeseries), y =as.vector(timeseries),
type = "b", col=cols[cycleNUms], lwd = 2, pch = 19,
xlab = "Time", ylab = "sqrt monthly sunspots",
main = "sunspot activity")
par(opt)


# comparing groups by overlaying them
opt <- par(bg = "grey95")
data <- window(timeseries, start = c(startYears[1], 1),
end = c(startYears[2] - 1, 12),
frequency = 12)
plot(1:length(data), data,
type = "l", col=cols[1], lwd = 2,
xlab = "Month number in 11 year sequence", ylab = "sqrt monthly sunspots",
main = "all 11 tear sequence: 1749-1983")

for ( i in 2:(ncycles - 1)) {
data <- window(timeseries, start = c(startYears[i], 1),
end = c(startYears[i+1] - 1, 12),
frequency = 12)
lines(1:length(data), data, col = cols[i], lwd = 2)
}

# tidy up the last partial cycle
data <- window(timeseries, start = c(startYears[ncycles], 1),
end = end(timeseries),
frequency = 12)
lines(1:length(data), data, col = cols[ncycles], lwd = 2)
par(opt)



#### an interactive display
library(loon)

# plot the original data, marking 11 year cycles in colour
p <-l_plot(x =time(timeseries), y=as.vector(timeseries),
ylabel="Mean sqrt monthly sunspots",
xlabel="Time",
title="Sunspot activity",
linkingGroup="sunspots",
color = cols[cycleNums], size=1,showGuides=TRUE)

# and all the cycles on top of each other
cycleplot <-l_plot(x = cyclePositions,
y=as.vector(timeseries),
ylabel="Mean sqrt monthly sunspots",
xlabel="Month number in 11 year sequence",
title="11 year sequences",linkingGroup="sunspots",
color = cols[cycleNums], size=1,showGuides=TRUE)

# adding a histogram in the same linking group allows
# all of the plots to be linked together
h <-l_hist(x= cycleNums,xlabel="Cycle number",
title="Sunspot cycles",linkingGroup="sunspots",
color = cols[cycleNums],showStackedColors =TRUE,
showGuides=TRUE)

# now add the lines as layers
data <- window(timeseries, start = c(startYears[1], 1),
end = c(startYears[2] - 1, 12),
frequency = 12)
l_layer_line(cycleplot, x = 1:length(data), y = data, color = cols[1],
label = paste(start(data)[1], end(data)[1], sep="-"), index = "end")
l_layer_line(p, x = time(data), y = data, color = cols[1],
label = paste(start(data)[1], end(data)[1], sep="-"), index = "end")

for (i in 2:(ncycles - 1)){
data <- window(timeseries, start = c(startYears[i], 1),
end = c(startYears[i+1] - 1, 12),
frequency = 12)
l_layer_line(cycleplot, x = 1:length(data), y = data, color = cols[i],
label = paste(start(data)[1], end(data)[1], sep="-"),
index = "end")
l_layer_line(p, x = time(data), y = data, color = cols[i],
label = paste(start(data)[1], end(data)[1], sep="-"),
index = "end")

}

# tidy up the last partial cycles
data <- window(timeseries, start = c(startYears[ncycles], 1),
end = end(timeseries),
frequency = 12)
l_layer_line(p, x = time(data), y = data, color = cols[ncycles],
label = paste(start(data)[1], end(data)[1], sep="-"),
index = "end")

time series decomposition

plot(co2)


# explore it interactively
library(loon)
getDates <- function(ts){
mapply(FUN = function(x,y) {paste(x,y)}, floor(time(ts)),
month.name)
}
data = co2

p <- l_plot(data, title="Muna loa atmospheric CO2",
ylabel = "concentration (ppm)",
xlabel = "Date", linkingGroup = "co2", color = "grey20",
showScales = TRUE, showGuides = TRUE,
itemLabel = getDates(data), showItemLabels = TRUE)

# another way to get the x and y coordinates
xy.raw <- xy.coords(data)
l_layer_line(p, x = xy.raw$x, y = xy.raw$y,
clor = "grey50", linewidth = 3, index = "end",
label = "connect the dots")




# a couple of helper functions
getYears <- function(ts) {unique(floor(time(ts)))}
getYears(co2)

# get the number of the month
# this will actually work more generally using frequency()
getMonthNos <- function(ts){1:frequency(ts)}
getMonthNos(co2)

# we can now compute the averages, first over all months for each year, and
# second over all years for each month
getAves <- function(x, by = "year") {
years <- getYears(x)
monthNos <- getMonthNos(x)
nyears <- length(years)
nmonths <- length(monthNos)
if (! (by %in% c("year", "month"))){
stop("unkwown value for by = ",
by,
"; by must be one of {\"year\",\"month\"}")
}
if (by == "year") {
aves <- sapply(1:nyears,
FUN = function(i) {
mean(window(x,
start = c(years[i], monthNos[1]),
end = c(years[i], monthNos[nmonths])))
})
aves <- data.frame(aves = aves, row.names = years)
} else {
aves <- sapply(1:nmonths,
FUN = function(i) {
mean(window(x,
start = c(years[1], monthNos[i]),
end = c(years[nyears], monthNos[i]),
frequency = 1))
})
aves <- data.frame(aves = aves, row.names = month.abb[monthNos])
}
aves
}



head(getAves(co2, by = "month"))

# plot them
plot(getYears(co2), getAves(co2, by = "year")$aves,
type = "l", xlab = "Year",
ylab = "annual average over ll months")
plot(getAves(co2, by = "month")$aves,
type = "l", xlab = "Year",xaxt = "n",
ylab = "monthly average over all years")
axis(1,at = getMonthNos(co2), labels = month.abb) #month.abb: month abbreviations



# reduce the white space between plots
savePar <- par(mfrow = c(3,1),
mar = c(2,4,1.5,0.5), oma = rep(0.1,4))



# decompositions of a time series
# the default decomposition is an additive one
co2_decomp <- decompose(co2)
# having structure
str(co2_decomp)
plot(co2_decomp)
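
# for the additive decomposition the pieces should sum back to the data
# (up to the NAs the moving average leaves at the ends); a quick check:
recomposed <- with(co2_decomp, trend + seasonal + random)
ok <- !is.na(recomposed)
all.equal(as.vector(co2)[ok], as.vector(recomposed)[ok])  # TRUE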


# we can see how the local fits behave as we change various elements
# xloc is the location where we are fitting
# proportionRange stands for the proportion of the x range, which determines
# what is meant by "local"
# all weights are 1 or zero

ourdata <- data.frame(x = xy.raw$x, y = xy.raw$y)
unitWeight <- function(xloc, x, proportionRange = 0.5){
xhalf <- diff(range(x)) * proportionRange/2
wts <- rep(0, length(x))
wts[x < xloc + xhalf & x > xloc - xhalf] <- 1
wts
}

# we have our data with x and y as before and need weights
xloc <- 0.5
weights <- unitWeight(xloc, xy.raw$x, proportionRange = 1)
# and use these to fit a line

fitw <- lm(y~x, data = ourdata, weights = weights)


# use a weight function shaped like a Gaussian
GaussianWeight <- function(xloc, x, h = 1){
# normal density
dnorm(x, mean = xloc, sd = h)
}




# the function below produces a smooth curve by minimizing the locally
# weighted sum of squares, or LoWeSS.
# it requires:
# x, y: the data
# xloc: x locations at which the estimate is to be calculated
# span: a bandwidth
# weightFn: a weight function; the default is GaussianWeight
# nlocs: number of equi-spaced locations at which the estimate will be
# calculated if xloc is NULL; ignored otherwise

LoWeSS <- function(x, y, xloc = NULL, span = 0.1, weightFn = GaussianWeight,
nlocs = 200){
data <- data.frame(x= x, y = y)
xrange <- extendrange(x)
bandwidth <- (diff(xrange)*span)/4

if (is.null(xloc)){
xloc <- seq(xrange[1], xrange[2], length.out = nlocs)
}
estimates <- vector("numeric", length = length(xloc))

for (i in 1:length(xloc)){
weights <- weightFn(xloc[i], x, bandwidth)
fit <- lm(y~x, data = data, weights = weights)
pred <- predict(fit, newdata = data.frame(x= xloc[i]))
estimates[i] <- pred
}
# return the estimates
list(x = xloc, y = estimates)
}

x <- xy.raw$x
y <- xy.raw$y
smooth <- LoWeSS(x,y, nlocs = 1000, span = 0.75)
plot(x,y,col="grey50", pch = 19, main = "our data", type = "p")
lines(smooth$x, smooth$y, col = "steelblue", lwd = 3)


# the data
plot(x, y , col = "grey50", pch = 19, main = "relating y to x")


# the LoWeSS estimate
lines(smooth$x, smooth$y, col = "steelblue", lwd = 3)
# add the true mu(x)
xordered <- sort(x)
library(stpm)
lines(xordered, mu(xordered), col = "firebrick", lwd = 3, lty = 2)
# mu function is in library(stpm)

# locally weighted sums of squares
fit <- loess(y~x, data = ourdata)
plot(x,y,col = "grey50", pch =19, main = "relating y to x")
# get the values predicted by the loess
pred <- predict(fit)
# get the order of the x
ord <- order(x)
lines(x[ord], pred[ord], lwd = 2, col = "steelblue")


# we could also reduce the span being used here (default is 0.75)
fit_Large <- loess(y~x, data = ourdata, span = 0.15)
pred_Large <- predict(fit_Large)
plot(x, y, col = "grey50", pch = 19, main = "relating y to x (span = 0.15)")
lines(x[ord], pred_Large[ord], lwd = 2, col = "steelblue")


# using spans to decompose
# first fit the large span
fit_Large <- loess(y~x, data = ourdata, span = 0.75)
pred_Large <- predict(fit_Large)
# from the fit we extract the estimated residuals
r_Large <- fit_Large$residuals

# this is another data set that we can plot
plot(x, r_Large, col = "grey50", pch =19,
main = "s_small fit of r_large on x")
# and we can fit this with a smaller span
data_resids <- data.frame(r = r_Large, x = x)
fit_small <- loess(r~x, data = data_resids, span = 0.2)

# get the values predicted by the loess
pred_small <- predict(fit_small)
# need the order of x
ord <- order(x)
lines(x[ord], pred_small[ord], lwd = 2, col = "steelblue")


r_Remains <- fit_small$residuals
data.remains <- data.frame(r = r_Remains, x= x)
fit_Tiny <- loess(r~x, data = data.remains, span = 0.5)
# get the values predicted by the loess
pred_Tiny <- predict(fit_Tiny)


# putting the various fitted smooths together, we have a decomposition of the
# observed values as
opt <- par(mfrow = c(3,1), mar = c(0,4,4,2) + 0.1)

plot(x[ord], pred_Large[ord], xlab = "", ylab = "", xaxt = "n",
col = "grey50", pch =19, cex = 0.5, main = "large span smooth")

# add a range of residuals rectangle
xleft <- 1.01
xright <- 1.02
yDelta <- sd(r_Remains)
rectCol <- adjustcolor("red", 0.5)
ymiddle <- mean(pred_Large)
rect(xleft, ymiddle - yDelta, xright, ymiddle+yDelta,
border = "grey50", col = rectCol)
plot(x[ord], pred_small[ord], xlab = "", ylab = "", xaxt = "n",
col = "grey50", pch = 19, cex= 0.5, main = "small span smooth")
# add the same range rectangle
ymiddle <- mean(pred_small)
rect(xleft, ymiddle - yDelta, xright, ymiddle+yDelta,
border = "grey50", col = rectCol)

plot(x[ord], r_Remains[ord], xlab = "", ylab = "", xaxt = "n",
col = "grey50", pch = 19, cex= 0.5, main = "remaining residuals")
abline(h = 0, col = "lightgrey")

# add the same range rectangle
ymiddle <- mean(r_Remains)
rect(xleft, ymiddle - yDelta, xright, ymiddle+yDelta,
border = "grey50", col = rectCol)

par(opt)



# the seasonal components now vary a little more over the years
# there are similar arguments t.window and t.degree for the loess smoothing of the
# trend component
plot(stl(co2, s.window = 7, s.degree = 1, t.degree = 1))

library("loon")
l_plot(stl(co2, s.window = 7, s.degree = 1, t.degree = 1))
demo("l_timeseries")

transforming data

x <- round(rnorm(10), digits = 2)
# the ranks (in order smallest to largest)
rank(x)

# the family of power transformation
powerfun <- function(x, alpha){
if (alpha == 0) {
log(x)
} else {
(x^alpha - 1)/alpha
}
}
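
# powerfun is continuous in alpha at 0, since (x^alpha - 1)/alpha -> log(x)
# as alpha -> 0; a quick numerical sketch:
powerfun(2, 1e-8)  # approximately log(2) = 0.6931...
powerfun(2, 0)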

library(loon.data); data(SAheart, package = "loon.data")
library(qqtest)
data <- SAheart$sbp
savePar <- par(mfrow = c(1,2))
# note alpha is in descending order here
for (alpha in c(2,1,1/2,1/3,0,-1/3,-1/2,-1,-2)) {
newdata <- powerfun(data, alpha)
dens <- density(newdata, bw = "SJ")
h <- hist(newdata, plot = FALSE)
ylim <- extendrange(c(dens$y, h$density))
plot(h, ylim = ylim, freq = FALSE, yaxt = "n", xaxt = "n",
main = "SAheart", xlab = paste0("(sbp)^", round(alpha, digits = 2)),
ylab ="")
polygon(dens, col = adjustcolor("grey50", 0.3))
qqtest(newdata, main = "normal qqplot", legend = FALSE, cex = 0.5)
}
par(savePar)


# an interactive exploration of changing alpha
library(loon)
# something to scale the data
scale01 <- function(x) {
(x - min(x))/diff(range(x) + 1)
}

## setting up the interactive plots that will respond to changing power
power <- function(x, xlab = NULL, labels = NULL, lowest = -5, highest = 5, ...){
if (is.null(xlab)) {xlab <- "x"}
if (is.null(labels)) {labels <- as.character(1:length(x))}

#scale and order the x values
xs <- scale01(x)
ord <- order(xs)
xs <- xs[ord]
labels <- labels[ord]
# normal quantiles
qs <- qnorm(ppoints(length(x)))
# qqplot
p <- l_plot(x = qs, y = xs, itemLabel = labels,
ylabel = xlab, xlab = "Gaussian qunatiles", ...)
# now we will build a more complex interface: a histogram plus a slider for
# the power
# the top level window
tw <- tktoplevel()
# create and add the histogram to the top window
hx <- l_hist(xs, xlabel = xlab, yshows = "density", parent = tw, ...)
# now create a slider whose value will be the alpha of the power
# transformation
alpha_x <- tclVar("1")
# slider for x
sx <- tkscale(tw, orient = 'horizontal', label = 'power x',
variable = alpha_x, from = lowest, to = highest,
resolution = 0.1)
# pack the histogram and slider into the same window
tkpack(sx, fill = 'x', side = 'bottom')
tkpack(hx, fill = 'both', expand = TRUE)

# now we need an update function to evaluate when the slider changes

# this is the power function that will change with the slider
update <- function(...){
alphax <- as.numeric(tclvalue(alpha_x))
newx <- sort(scale01(powerfun(x,alphax)))
# update the label
xlabel <- if(alphax ==0){paste0("log(",xlab,")")
} else{
if (alphax == 1){
xlab
} else {
paste0(xlab, "^", alphax)
}
}
xrng <- range(newx)
# update the qqplot p
l_configure(p, y = newx, ylabel = xlabel, xlabel = "Gaussian quantiles")
# update the histogram
binwidthx <- hx['binwidth']
l_configure(hx, x = newx, binwidth = binwidthx, xlabel = xlabel)
l_scaleto_world(hx)
}
# configure the scale to call "update"
tkconfigure(sx, command = update)
invisible(p)
}



with(SAheart,
p <- power(x= sbp, labels = paste("Patient", rownames(SAheart)),
linkingGroup = "SAheart"))

# Tukey's ladder of power transformations
data <- rchisq(200, 2)
power(data) # explore the power transformation interactively
# if the data is concentrated on lower values, move the power lower on the
# ladder; otherwise move it higher

# power transformation
library(MASS)
data(mammals)

# straightening plots via power transformation
power_xy <- function(x, y = NULL, xlab = NULL, ylab = NULL,
from= 5, to = -5,...){
if (is.null(xlab)) {xlab <- "x"}
if (is.null(ylab)) {ylab <- "y"}

scale01 <- function(x) {
(x - min(x))/diff(range(x) + 1)
}
tt <- tktoplevel() # background on dynamic graphics in R (worth a look):
# https://www.stat.berkeley.edu/~spector/s244/Dyngraph.html
# The basic widget in Tk is the frame, which is essentially a container for
# any other widgets you wish to use.
# You can have as many frames as you want, but each individual entity which
# appears on the screen must be in a "top level" frame,
# created with the tktoplevel function.
# When using Tk, you first create a widget (specifying its appearance, the
# command to be executed when the widget is activated, etc.), and then you
# specify how it should appear in its parent frame. While Tk provides a number
# of different schemes for achieving this, we'll discuss the technique known as
# "packing". The basic idea is to group together widgets (and accompanying
# labels, if appropriate) into a series of frames, and then to call the tkpack
# function to put the widgets in the frames, then to finally pack the frames
# into a main frame which is contained in a top-level widget
# By default, objects are packed into a frame from top to bottom, with each
# object being centered within its parent frame. You can control how widgets
# get packed into a frame by the order in which you call tkpack, and the
# optional side= argument which can take the values "left", "right", "top"
# or "bottom"
xs <- scale01(x)
ys <- scale01(y)

# scatterplot
p <- l_plot(x= xs, y = ys, xlabel = xlab, ylabel = ylab,
parent = tt,...)
# a valid Tk parent widget path. When the parent widget is specified (i.e.
# not NULL) then the plot widget needs to be placed using some geometry
# manager like tkpack or tkplace in order to be displayed
fit <- lm(ys~xs)
# layer a fitted line
xrng <- range(xs)
yhat <- predict(fit, data.frame(xs = xrng))
line <- l_layer_line(p, x= xrng, y = yhat, linewidth = 2, index = "end")

# histogram on each variate (hy is needed by update() below)
hx <- l_hist(xs, xlabel = xlab, yshows = "density")
hy <- l_hist(ys, xlabel = ylab, yshows = "density")

# setting up separate sliders for each power transformation
alpha_x <- tclVar('1')
alpha_y <- tclVar('1')
sx <- tkscale(tt, orient = 'vertical', label = 'power x', variable = alpha_x,
from = from, to = to, resolution = 0.1)
sy <- tkscale(tt, orient = 'vertical', label = 'power y', variable = alpha_y,
from = from, to = to, resolution = 0.1)
tkpack(sx, fill = 'y', side = 'right')
tkpack(sy, fill = 'y', side = 'right')
tkpack(p, fill = 'both', expand = TRUE)

update <- function(...) {
alphax <- as.numeric(tclvalue(alpha_x))
alphay <- as.numeric(tclvalue(alpha_y))
newx <- scale01(powerfun(x, alphax))
newy <- scale01(powerfun(y, alphay))
xlabel <- if (alphax == 0) {paste0("log(", xlab,")")} else{
if (alphax== 1) {xlab}else {
paste0(xlab, "^", alphax)
}
}
ylabel <- if (alphay == 0) {paste0("log(", ylab,")")} else{
if (alphay== 1) {ylab}else {
paste0(ylab, "^", alphay)
}
}
# fit the line to the new data
fit.temp <- lm(newy~newx)
xrng <- range(newx)
## get the fitted line
yhat <- predict(fit.temp, data.frame(newx=xrng))
# update the line values
l_configure(line, y= yhat, x=xrng)
# reconfigure the points
l_configure(p, x = newx, y= newy, xlabel = xlabel, ylabel = ylabel)
# and the histogram
binwidthx <- hx['binwidth']
binwidthy <- hy['binwidth']
l_configure(hx, x = newx, binwidth = binwidthx,
xlabel = xlabel)
l_scaleto_world(hx) #Change Plot Region to Display All Plot Data
l_configure(hy, x = newy,
binwidth = binwidthy,
xlabel = ylabel)
l_scaleto_world(hy)
}
# add the update functions to the sliders
tkconfigure(sx, command = update)
tkconfigure(sy, command = update)
invisible(p)
}

categorical data overview

library(loon)
areas <- unique(olive[,"Area"])
nAreas <- length(areas)
counts <- rep(0, nAreas)
regions <- c()
# construct the dataset
for (i in 1:nAreas) {
counts[i] <- sum(olive[,"Area"] == areas[i])
regions <- c(regions, unique(olive[olive[,"Area"] == areas[i], "Region"]))
}
# turn regions into a factor with the proper names
regions <- as.factor(levels(olive[,"Region"])[regions])
# our constructed dataset of counts
oliveOilCounts <- data.frame(Region = regions, Area = areas, count= counts)

library(treemap)
treemap(oliveOilCounts, index = c("Region", "Area"), vSize = "count") # vSize could be any variate




# could construct one interactively
itreemap(oliveOilCounts, index = c("Region"), vSize = "count")



# another package, based on ggplot2 for graphics, employs an algorithm which
# tries to achieve much squarer boxes

library(treemapify)
library(ggplot2)
treeMapPlot <- ggplot(oliveOilCounts, aes(area = count, fill = Region, # colour by region
label = Area, subgroup = Region))
treeMapPlot + geom_treemap() + geom_treemap_text() +
geom_treemap_subgroup_border() + geom_treemap_subgroup_text()


# alternatively, the coordinates of the squares can be had via the treemapify(...)
# function
library(treemapify)
# alternatively
treeMapCoordinates <- treemapify(oliveOilCounts, area = "count",
subgroup = "Region")
head(treeMapCoordinates)




# non-nested data - venneuler package
library(venneuler) # not run: this needs the Java JDK, plus rJava and
# venneuler installed for RStudio, which I didn't want to download
ve <- venneuler(c(A=1, B=1, "A&B" = 1))
plot(ve)

HairEye <- apply(HairEyeColor, 1:2, sum)
library(eikosograms)
eikos(y = "Hair", x = "Eye", data = HairEye, xaxs = FALSE, yaxs = FALSE)

categorical data

#######################################################
# categorical data
#######################################################
city <- c("Kitchener", "Waterloo")
housing <- c("House", "Apartment", "Residence")
# fake data
counts <- rpois(6, lambda = 50)
# arranged in a rectangular array
vacancy <- matrix(counts, nrow = length(city), ncol = length(housing),
byrow = TRUE, dimnames = list(city = city, housing = housing))
# coerced to be an object of class table
vacancy <- as.table(vacancy)
vacancy
# creating a multi-way table
term <- c("Fall", "Winter", "Spring")
# more fake counts
counts <- seq(from = 10, to = 180, by = 10)
vacancy <- array(counts,
dim = c(length(city), length(housing), length(term)),
dimnames = list(city = city, housing = housing, term = term))
vacancy <- as.table(vacancy)
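# a flat, printable view of the three-way table (a sketch, using base R's ftable()):
ftable(vacancy, row.vars = c("city", "housing"), col.vars = "term")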



# reordering variates in tables
# the array dimension permutation function aperm(..)
aperm(vacancy, perm = c(3,2,1))



# constructing tables from data
library(loon.data)
data("SAheart", package = "loon.data")
table(SAheart$chd, SAheart$famhist, dnn = c("chd", "famhist"))
# by cross-tabulation ("cross tabs" or xtabs)
xtabs(~chd + famhist, data = SAheart)


# collapsing dimensions (marginalizing, projecting)
# zero dimensional
margin.table(HairEyeColor)

# 1 dimensional
margin.table(HairEyeColor, margin = 1)
# 2 dimensional -- here margins 1 and 2 ("hair", "eye") are preserved
margin.table(HairEyeColor, margin = c(1,2))

# note: except for the 0 dimensional case, these are the same as using apply with sum
apply(HairEyeColor, MARGIN = 1, FUN = sum)
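# quick check (a sketch): the two computations agree
all.equal(as.vector(margin.table(HairEyeColor, margin = 1)),
as.vector(apply(HairEyeColor, MARGIN = 1, FUN = sum)))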


# adding sums to a table
# every margin is summed
addmargins(HairEyeColor)


# just producing marginal sums over dimension 2 ("eye") values
# for each pair (i,k) of remaining variates "hair" and "sex"
addmargins(HairEyeColor, margin = 2)


# producing marginal sums over both dimensions 1 and 2 ("hair", and "eye")
addmargins(HairEyeColor, margin = c(1,2))


# sums to proportions
# no margins fixed, just the total ... a single multinomial
round(prop.table(HairEyeColor), 3)

# one margin (the third here, i.e. Sex) is fixed ... as many multinomials
# as in
round(prop.table(HairEyeColor, margin = 3),2)
# probability models from tables
# easier to see with sums for a two way table
HairEye <- margin.table(HairEyeColor, margin = c(1,2))
# the proportions in each conditional table (here each column) must sum to one
round(prop.table(HairEye, margin = 2), 2)
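# verify (a sketch): each column of conditional proportions sums to one
colSums(prop.table(HairEye, margin = 2))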


library(tidytable) # provides an implementation of the rules we developed
# for table analysis

tidytable(HairEyeColor)$table

proportions <- round(prop.table(HairEye, margin = 2), 2)
tidyproportions <- tidytable(proportions)
tidyproportions$proportions

categorical modelling

#######################################################
# categorical modelling - modelling and independence
#######################################################
HairEye <- margin.table(HairEyeColor, margin = c(1,2))
total <- margin.table(HairEye)
colsums <- margin.table(HairEye, margin = 2)
colsums
rowsums <- margin.table(HairEye, margin = 1)
rowsums
rowProbs <- rowsums / total
rowProbs
# Draw one sample for column 1
rmultinom(n = 1, size = colsums[1], prob = rowProbs)

# the function that generates a new table for fixed column totals
# and given row probabilities
generateCondTable <- function(columnSums = seq(100, 300, 100),
rowProbs = seq(0.1,0.4,0.1),
dnn = NULL) {
if (is.null(dnn)) {
dnn <- list(Y = paste0("y_", 1:length(rowProbs)),
X = paste0("x_", 1:length(columnSums))
)
}
nRows <- length(rowProbs)
nCols <- length(columnSums)
result <- matrix(nrow = nRows, ncol = nCols)
for (j in 1:nCols){
result[,j] <- rmultinom(n = 1, size = columnSums[j], prob = rowProbs)
}
result <- as.table(result)
dimnames(result) <- dnn
result
}
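# example usage (a sketch; the seed is arbitrary, for reproducibility only)
set.seed(442)
generateCondTable(columnSums = c(100, 200, 300),
rowProbs = c(0.1, 0.2, 0.3, 0.4))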



# unconditional - NO fixed margin
generateUncondTable <- function(total = 1000, colProbs = seq(0.1, 0.4, 0.1),
rowProbs = seq(0.4, 0.1, -0.1), dnn = NULL){
colSums <- rmultinom(n=1, size = total, prob = colProbs)
generateCondTable(columnSums = colSums, rowProbs = rowProbs, dnn = dnn)
}

# generating a new table under the hypothesis of independence , from an existing
# table of counts
generateTable <- function(Table, y, x, conditionalTest = TRUE) {
new_table <- margin.table(Table, margin = c(y,x))
#get the marginal probabilities for the response variates
rowSums <- margin.table(new_table, margin = y)
rowProbs <- prop.table(rowSums)
# get the marginal probabilities for the conditioning variate
colSums <- margin.table(new_table, margin = x)
colProbs <- prop.table(colSums)
# get the dnn
dnn <- dimnames(new_table)
if (conditionalTest){
generateCondTable(columnSums = colSums, rowProbs = rowProbs,
dnn = dnn)
} else {
generateUncondTable(total = sum(new_table),
colProbs = colProbs,
rowProbs = rowProbs,
dnn = dnn)
}
}
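# example usage (a sketch): a new table generated under independence from
# HairEyeColor, with Hair (margin 1) as response, conditioning on Eye (margin 2)
generateTable(HairEyeColor, y = 1, x = 2, conditionalTest = TRUE)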

# need two functions of the single argument data which will generate new data under
# the hypothesis
generateTableDataCond <- function(data){
newtable <- generateTable(data$table, y = data$y, x = data$x,
conditionalTest = TRUE)
list(table = newtable, y= data$y, x = data$x)
}

generateTableDataUncond <- function(data){
newtable <- generateTable(data$table, y = data$y, x = data$x,
conditionalTest = FALSE)
list(table = newtable, y= data$y, x = data$x)
}


# the function that will show an eikosogram
showTable <- function(data, Suspect, draw = FALSE){
result <- eikos(y =data$y, x = data$x,
data = data$table, legend = FALSE,
xlabs = FALSE, ylabs = FALSE,
xaxs = FALSE, yaxs = FALSE,
main = paste("Suspect", Suspect),
draw = draw) # default don't draw
invisible(result)
}


# since the eikos function is implemented using grid, need to update the lineup function

lineup1 <- function(data, showSuspect = NULL, generateSuspect = NULL,
trueLoc = NULL, layout = c(5,4),
pkg = c("graphics", "grid", "ggplot2")){
# get the number of suspects in total
nSuspects <- layout[1] *layout[2]
if (is.null(trueLoc)) {trueLoc <- sample(1:nSuspects, 1)}
if (is.null(showSuspect)) {stop("need a plot function for the subject")}
if (is.null(generateSuspect)) { stop("need a function to generate subject")}

# need to decide which subject to present
presentSuspect <- function(suspectNo){
if (suspectNo != trueLoc) {data <- generateSuspect(data)}
showSuspect(data, suspectNo)}

# plotting depends on the plotting package
pkg <- match.arg(pkg) # Argument Verification Using Partial Matching
switch(pkg,
"graphics" = {
savePar <- par(mfrow = layout,
mar = c(2.5,0.1,3,0.1),
oma = rep(0,4))
sapply(1:nSuspects, FUN = presentSuspect)
# the plot layout here is of no value for graphics
plotLayout <- layout
par(savePar)
},
"grid" = {
grobs <- lapply(1:nSuspects, FUN = presentSuspect)
plotLayout <- arrangeGrob(grobs = grobs, # Arrange multiple grobs on a page
nrow = layout[1], ncol = layout[2])
},
"ggplot2" = {
plots <- lapply(1:nSuspects, FUN = presentSuspect)
plotLayout <- arrangeGrob(grobs = plots,
nrow = layout[1], ncol = layout[2])
},
stop("Wrong pkg"))
# return obfuscated location and plot (if not "graphics")
list(trueLoc = hideLocation(trueLoc, nSuspects),
plotLayout = plotLayout)
}

results <- lineup1(..., pkg = "grid", ...) # must supply the other args
grid.arrange(results$plotLayout) # display the lineup
revealLocation(results$trueLoc) # reveal the true location

library(eikosograms); library(gridExtra)
results <- lineup1(data,
generateSuspect = generateTableDataCond,
showSuspect = showTable,
layout = c(5,5),
pkg = "grid")

# display equivalence in eikosograms
titanicsurvclass <- margin.table(Titanic, margin = c(1,4))
mosaicplot(titanicsurvclass, main = "Mosaic plot (two way table)")

# spine plot
spineplot(titanicsurvclass, main = "spineplot (two way table)")



# eikosograms and mosaic plots differ when there are more than two variates
eikos(y = "Survived", x = c("Class", "Sex"),
data = Titanic, xaxs = FALSE, yaxs = FALSE,
main = "Eikosogram (three way table)")

# mosaic plot, need to construct the marginal table first,
# note that the table is ordered so that survived is first,
# then class and sex (to match the eikosogram order above)
titanicsurvclasssex <- margin.table(Titanic, c(4,1,2))

# the mosaic plot
mosaicplot(titanicsurvclasssex, main = "mosaic plot (three way table)")

onmaps

#######################################################
# onmaps
#######################################################
library(maps)
savePar <- par(mfrow = c(1,2))
# first the map from the maps package
map("world", "Canada", xlim = c(-141, -53), ylim = c(40, 85),
col = "grey90", fill= TRUE)
title(main = "maps package")
# or use the map from the mapdata package
library(mapdata)
map("worldHires", "canada", xlim = c(-141, -53), ylim = c(40, 85),
col = "grey90", fill = TRUE)
title(main = "mapdata package")
par(savePar)

# annual precipitation as thermometer glyphs
library(maps)
precip <- read.csv("/Users/xuanxiaowu/Desktop/stat442/rcode/precip.csv")
# this csv can be found online; if you want it I can email it to you,
# or wait a few days until I've learned github

n <- nrow(precip)
lat <- vector("numeric", n)
pop <- vector("numeric", n)
long <- vector("numeric", n)
for (i in 1:n) {
city <- precip[i, "name"]
selected.city <- canada.cities[,"name"] == city
lat[i] <- canada.cities[selected.city, "lat"]
long[i] <- canada.cities[selected.city, "long"]
pop[i] <- canada.cities[selected.city, "pop"]
}

cities <- data.frame(precip, lat = lat, long= long, pop = pop)
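# an idiomatic alternative (a sketch): merge() does the same matching in one
# step, assuming the names in precip$name match canada.cities$name exactly
cities2 <- merge(precip, canada.cities[, c("name", "lat", "long", "pop")],
by = "name")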
# take the subset
indices <- c(5,13,18, 20, 24, 26, 27, 29, 32)
cities <- cities[indices, ]
map("world", "Canada", plot = TRUE, fill = FALSE)
symbols(cities$long - 1, cities$lat, inches = FALSE,
thermometers = cbind(rep(2, nrow(cities)),
rep(4, nrow(cities)),
(cities$precip)/2000), # precip in mm
fg = "red",
add = TRUE)


# statistical glyphs - any kind of graphic
vars <- c("days", "precip.mm.", "pop")
nVars <- length(vars)
ncities <- nrow(cities)
cityVals <- cities[,vars]
maxVals <- apply(cityVals, 2,max)
# get values as proportions of each column
stdCities <- sweep(cityVals, 2, maxVals,`/`) # sweep: Return an array obtained
#from an input array by sweeping out a summary statistic.
stdCitiesList <- Map(function(rowNum) {t(stdCities[rowNum,])},
1:ncities) # apply a function to each element of a vector
# in the loon package, the function l_make_glyphs() will create glyphs from any
# statistical graphic
library(loon)
# create the glyphs for each city
my_glyphs <- l_make_glyphs(stdCitiesList,
draw_fun = function(city){
par(mar = rep(1,4))
barplot(height = city,
beside = TRUE,
axes = FALSE,
col = rainbow(nVars),
axisnames = FALSE,
ylim = c(0,1))
}, width = 100, height = 100)
# can view the glyphs
l_imageviewer(my_glyphs)

# these can be added as points symbols to any l_plot
# plot city locations
p <- l_plot(cities$long, cities$lat,
xlabel = "longitude",
ylabel = "laitude",
showLabels = FALSE)
# add the map
Canada <- map("world", "Canada", fill = TRUE, plot = FALSE)
l_map <- l_layer(p, Canada, asSingleLayer = TRUE, color = "cornsilk")
l_scaleto_world(p) #Change Plot Region to Display All Plot Data
# then add the images as glyphs called "barplot"
g <- l_glyph_add_image(p, my_glyphs, "barplot")
p['glyph'] <- g

library(loon)
theEast <- cities$long > -80
p["selected"] <- theEast

# readShapeSpatial {maptools} R Documentation
# Read shape files into Spatial*DataFrame objects


# rm(list = ls()): rm will remove all of the objects that are stored in
# your global environment (which may be what you want) but will not unload any
# of the packages that you have loaded. You can do both by restarting your
# R session in RStudio with the keyboard shortcut Ctrl+Shift+F10 which will
# totally clear your global environment of both objects and loaded packages.
# use graphics.off() to clear all the plots you have generated

three dimensions

#######################################################
# three dimensions
#######################################################
library(MASS)
library(ggplot2)
# use square grids, count the number of points in each cell
# code count to saturation. here use geom_bin2d() from ggplot2
p <- ggplot(geyser, aes(x = duration, y = waiting))
h <- p + geom_bin2d(bins = 15)
h
# to switch the color around
library(RColorBrewer)
# brew a sequence of blues
blues <- brewer.pal(9, "Blues")
darkerBlues <- blues[-(1:2)] # remove the two lightest blues
h + scale_fill_gradientn(colours = darkerBlues)


# use hexbin for hexagonal binning
library("hexbin")
bins <- hexbin(x = geyser$duration, y = geyser$waiting, xbins = 15)
plot(bins, main = "hexagonal bin histogram")
# alternatively we can use hexbinplot directly
hexbinplot(waiting~duration,data = geyser, xbins = 15,
colramp = function(n) rev(gray.colors(n)), # rev:Reverse Elements
aspect = 1,
main = "Hexagonal bin histogram")

# or we could use size instead of the colour to indicate counts
hexbinplot(waiting~duration, data = geyser,
style = "lattice", aspect = 1,
main = "hexagonal bin histogram")
# both colour and size are used
hexbinplot(waiting~duration, data = geyser,
style = "nested.lattice", aspect = 1,
main = "hexagonal bin histogram")


# density estimation in two-dimensions
# calculate the contours of the kernel function
nxvals <- 100
nyvals <- 100
x <- seq(-5, 5,length.out = nxvals)
y <- seq(-5, 5,length.out = nyvals)
gridLocs <- expand.grid(x,y)
# now have the indices to match the locations
indices <- cbind(rep(1:nxvals, times = nyvals), rep(1:nyvals, each = nxvals))
# a matrix to store away the kernel values
kernel <- matrix(0, nrow = nxvals, ncol=nyvals)

# need multivariate normal densities from mvtnorm
library('mvtnorm')
# bivariate normal density
# (default is mean 0, and var-cov matrix the 2x2 identity matrix)
kernel[indices] <- dmvnorm(gridLocs)
# there is a contour function to show level curves
contour(x=x, y=y, z= kernel, asp = 1,
xlim = c(-2,2), ylim = c(-2,2),
col = "black", lty = 2,
main = "contours of a standard normal")
points(0,0, pch = 19, col = "steelblue")
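# a simpler equivalent (a sketch): with the identity covariance the bivariate
# density factorizes into a product of univariate normals, so outer() works
kernel2 <- outer(dnorm(x), dnorm(y))
all.equal(kernel, kernel2)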




# we can use this bivariate density as a kernel for bivariate density estimation
# a function that performs the density estimation is kde2d() from the MASS package
x <- geyser$duration
y <- geyser$waiting
den <- kde2d(x=x, y = y , n = 100)
str(den)

# now plot the density estimates as contours
zlim <- range(den$z)
plot(x,y,pch =19,
col = adjustcolor("steelblue", 0.5))
contour(den$x, den$y, den$z, col = "grey10",
levels = pretty(zlim, 10), lwd = 1, add = TRUE)
# pretty :Compute a sequence of about n+1 equally spaced ‘round’ values
# which cover the range of the values in x. The values are chosen so that
# they are 1, 2 or 5 times a power of 10.

# could add colour images of density
library(RColorBrewer)
blues <- brewer.pal(9, "Blues")
image(den$x, den$y, den$z, col = blues, xlab = "duration",
ylab = "waiting", main = "rectangles coloured by density")


# draw the density in three dimensions
# using the perspective "wire frame" drawings
lineCol <- "steelblue"
fillCol <- "cornsilk"
savePar <- par(mfrow = c(1,2))
# 3d surface
persp(den$z, border = lineCol, col = fillCol,
xlab = "duration", ylab = "waiting time",
zlab = "density", main = "geyser density estimate",
sub = "solid")

# see through
persp(den$z, border = lineCol, col = adjustcolor(fillCol,0.5),
xlab = "duration", ylab = "waiting time",
zlab = "density", main = "geyser density estimate",
sub = "transparent")

par(savePar)

# can change the viewing angles
theta.val <- 0 # horizontal rotation, moving right to left
phi.val <- 0 # angle of tilt toward the viewer
lineCol <- "steelblue"
fillCol <- "cornsilk"
persp(den$z, border = lineCol, col = fillCol,
xlab = "x", ylab = "y", zlab = "z",
# some magic below:
# bquote: allows partial substitution;
# terms wrapped in .() are evaluated
main = bquote(paste(phi, "=", .(phi.val), ";",
theta, "=", .(theta.val))),
theta = theta.val,
phi = phi.val)
# radius r, the distance of the eyepoint from the centre of the box
# changing the radius (think of points at infinity)
persp(den, border = lineCol, col = fillCol,
r = 0.5, phi = 0, xlab = "x", main = "r = 0.5")# the smaller the r, the sooner the parallel lines meet

# the lattice package could be used
library("lattice")
wireframe(den$z, xlab = "duration", ylab = "waiting", zlab = "density",
col = "grey50")

# can change the relative dimensions of the box
wireframe(den$z, xlab = "duration", ylab = "waiting", zlab = "density",
col = "grey50", aspect = c(1.2, 0.4))

library(loon.data); data(SAheart)
cloud(sbp~ldl + tobacco, data = SAheart)

# change the colour, point symbol, remove perspective and the frame box
cloud(sbp~ldl + tobacco, data = SAheart,
col = adjustcolor("grey10", 0.3),
pch = 19, perspective = FALSE,
par.settings = list(axis.line=list(col = NA)))


library(gridExtra)
cloud1 <- cloud(sbp~ldl + tobacco, data = SAheart,
col = adjustcolor("grey10", 0.3),
pch = 19, perspective = TRUE,
par.settings = list(axis.line=list(col = NA)),
# rotate (in degrees) about each axis
screen = list(z= 20, x = -70, y = 0),
xlab = "", ylab = "", zlab = "")
# rotate about y by 3 degrees
cloud2 <- cloud(sbp~ldl + tobacco, data = SAheart,
col = adjustcolor("grey10", 0.3),
pch = 19, perspective = TRUE,
par.settings = list(axis.line=list(col = NA)),
# rotate (in degrees) about each axis
screen = list(z= 20, x = -70, y = 3),
xlab = "", ylab = "", zlab = "")
# stereo display
grid.arrange(cloud1, cloud2, nrow =1)



# use the rgl package
library(rgl)
solidcol <- "steelblue"
persp3d(den$z, col = solidcol)
# with a three button mouse:
# left button - rotate
# middle button - change perspective
# right button - resize
# get rid of perspective, make things orthogonal
# FOV (or field of view) is "the degree of parallax"
# FOV = 0, is orthogonal (or isometric) projection.
par3d(FOV = 0)
persp3d(den$z, col = "cornsilk")



# interactive three dimensional plots
# surfaces and planes
x <- den$x
y <- den$y
z <- den$z
# put the coordinates in a unit cube
myscale <- function(x){
xrange <- range(x)
(x - min(xrange))/diff(xrange)
}
x <- myscale(x)
y <- myscale(y)
z <- myscale(z)

surface3d(x,y,z, color = solidcol)
# we could add an arbitrary plane as well
# say the plane a*x + b*y + c*z + d = 0
planes3d(0,0,1, -median(z), col = "firebrick", alpha =0.5)
planes3d(0,0,1, -quantile(z, 0.75), col = "steelblue", alpha =0.5)

# a point in the centre of the box is
centre <- c(median(x), median(y), median(z) )
# a plane's orientation is given by its normal vector
a <- 1
b <- 1
c <- 1
normal <- c(a,b,c)
# get d to put this through the centre
d <- - t(normal) %*% centre
planes3d(a,b,c,d, col = "purple", alpha= 0.7)
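# sanity check (a sketch): the centre satisfies the plane equation
# a*x + b*y + c*z + d = 0, up to floating point error
sum(normal * centre) + as.numeric(d) # should be (numerically) zero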

marginal contribution

#######################################################
# marginal contribution - projections and slices
#######################################################



# distance determines the amount of perspective; the default is 0.2
cloud <- cloud(obesity~alcohol + tobacco, data = SAheart,
distance = 1, perspective = TRUE,
par.settings = list(axis.line=list(col = NA)),
# rotate (in degrees) about each axis
screen = list(z= 0, x = 10, y = 0),
xlab = "x", ylab = "y", zlab = "z")

# no perspective = marginal = orthogonal projection


# three dimensional scatterplot - interactive
library(loon)
cols <- rep("firebrick", length(SAheart$chd))
cols[SAheart$chd == "Yes"] <- "steelblue"
# need a new data frame
newdata <- data.frame(sbp = SAheart$sbp, ldl = SAheart$ldl,
tob = SAheart$tobacco)
# create the navigation graph
l_navgraph(data = newdata,
# optional arguments follow
color = cols, glyph = "ccircle", showGuides = TRUE)

# a free hand rotation in loon
with(l_scale3D(newdata),
l_plot3D(x = sbp, y = ldl, z = tob, color = cols,
showGuides = TRUE, glyph = "ccircle"))


# conditioning on one or more variate values
# beginning with a density estimate for sbp
library(ggplot2)
ggplot(data= SAheart, mapping = aes(sbp)) +
geom_density(fill = "steelblue", alpha =0.5)


# now condition on family history - by color
p <- ggplot(data= SAheart, mapping = aes(sbp)) +
geom_density(mapping = aes(fill = famhist), alpha = 0.5)


# now condition on family history - by locations and color
p + facet_grid(famhist~.)
p + facet_grid(.~famhist)

# now condition on family history (colour and location) and value of chd (location)
p + facet_grid(chd ~ famhist, labeller = label_both)

# a two variates display conditional on the value of a third,
p <- ggplot(data = SAheart, mapping = aes(ldl, sbp))
p + geom_point(color = "steelblue")
# condition on famhist(colour)
p + geom_point(mapping = aes(color = famhist))
# conditional on famhist (colour+location)
p + geom_point(mapping = aes(color = famhist)) + facet_grid(.~famhist)
# conditional on famhist (colour + location) - densities
p + geom_density_2d() +
facet_grid(.~famhist, labeller = label_both)

# conditional on a continuous variate, say tobacco - colour
p + geom_point(mapping = aes(color = tobacco), size = 2, alpha = 0.5)




# conditional on a continuous variate, say tobacco -size
p + geom_point(mapping = aes(size = tobacco), color = "steelblue", alpha = 0.5)


# cut_interval(): makes n groups having equal range
# cut_number(): makes n groups having equal numbers of observations
# cut_width(): makes however many groups of constant width
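# a quick illustration (a sketch) of the three cut_* helpers on tobacco
table(cut_interval(SAheart$tobacco, n = 3)) # equal-range groups
table(cut_number(SAheart$tobacco, n = 3)) # equal-count groups
table(cut_width(SAheart$tobacco, width = 5)) # constant-width groups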

# conditioning on one or more variate values
p + geom_density2d(color = "steelblue") +
facet_grid(.~cut_number(tobacco, n = 3))

p + geom_point(color = "steelblue",alpha =0.5) +
facet_grid(.~cut_number(tobacco, n = 3))

# turn an ggplot into an interactive loon plot
# this needs an extra package; see https://cran.r-project.org/web/packages/loon.ggplot/index.html
gg_p <- p + geom_density2d(color = "steelblue") + geom_point() +
facet_grid(.~cut_number(tobacco, n =3))
myplot <- loon.ggplot(gg_p, linkingGroup = "SAheart")
# marginalization = orthogonal projection


p2 <- l_plot(SAheart$tobacco, jitter(rep(1, length(SAheart$tobacco))),
linkingGroup = "SAheart")
p3D <- l_plot3D(l_scale3D(SAheart[,c("obesity", "tobacco", "alcohol")]),
linkingGroup = "SAheart")


noncartesian


#######################################################
# noncartesian 1 - beyond three dimensions
#######################################################
data <- iris[,1:4] # the four measurements on Anderson's irises
scale01 <- function(x) {
(x - min(x))/diff(range(x))
}

data <- apply(data, 2, scale01)
library(TeachingDemos)
faces2(data[c(1,51,101), ],
which = c(8,13,14,17), # select features
scale = "none", nrows = 1, ncols = 3)


# with the appropriate packages, we can explore how variable these features are:
library(tkrplot)
if(interactive()){
tke2 <- rep(list(
list('slider',
from = 0, to = 1, init = 0.5,
resolution = 0.1)),
18)
names(tke2) <- c('CenterWidth','TopBottomWidth','FaceHeight','TopWidth',
'BottomWidth','NoseLength','MouthHeight','MouthCurve',
'MouthWidth','EyesHeight','EyesBetween','EyeAngle',
'EyeShape','EyeSize','EyeballPos','EyebrowHeight',
'EyebrowAngle','EyebrowWidth')
tkfun2 <- function(...){
tmpmat <- rbind(Min = 0, Adjust = unlist(list(...)), Max = 1)
faces2(tmpmat, scale = 'none')
}
tkexamp(tkfun2, list(tke2), plotloc = 'left',
hscale = 2, vscale = 2)
# Create Tk dialog boxes with controls to show examples of
# changing parameters on a graph.
}

n <- nrow(data)
faces2(data[sample(1:n, n, replace = FALSE),],
which = c(6,7,8,12),
scale = "columns",
nrows = 10, ncols = 15)

# a function in R called stars(...) will produce Nightingale's rose style glyphs
n <- nrow(data)
stars(data[sample(1:n, n, replace= FALSE), ],
nrow =10, ncol = 15,
main = "Nightingales Rose (almost)",
draw.segments = TRUE, cex = 1.2)
# data explorations using stars(..) again
stars(data, locations =data[,1:2],
lwd = 0.5, # narrow the line
len = 0.05,
labels = NULL, radius = TRUE,
col.lines = rep(adjustcolor("steelblue", 0.5), n),
col.stars = rep(adjustcolor("steelblue", 0.5), n),
nrow = 10, ncol = 15,
key.loc = c(12,-0.5),
main = "iris data"
)



Tableau

Ctrl + Shift + B -> make the workspace bigger
Ctrl + B -> make the workspace smaller

Join and blend

Join: in the data source view, click the sheet you want to join; once the sheet's information appears, click the small rectangular block at the top middle of the table, then drag in the other table to be joined. Two overlapping circles appear between them; click them to adjust the join type.

Blending is a special kind of join. A join can only combine data of the same type, but blending can combine different kinds, e.g. blending an Excel sheet with a SQL dataset.
A join speaks of the left and right tables; in blending these are called the primary data source and the secondary data source.

Blending requires the two sources to share a common key.
Blending will result in some level of data aggregation (sum, avg, etc.).
Default: sum.
For character values that cannot be summed, an asterisk * is returned.
If there is no common key when blending, null is returned.

Scatter plot

We find that a scatter plot needs three values: two establish the coordinates (columns + rows) and one is a measure, dragged into Marks.

For example, column: transaction + rows: amount + marks: customer_id.
customer_id brings in each customer's information.

Handling outliers: delete them, or apply a log transformation to the column/row.

Log transformation: use the sum and log functions in a calculated field.

Computing more detail by sub-category:

Add a calculated field:
{ FIXED [Sub category] : AVG([Amount]) }
and put it on Size in the Marks card.
Also, because it is a value computation, Tableau will automatically sum it; we do not want the sum over the whole sub-category, so change the aggregation to avg.

Clusters: found in the Analytics pane; they split the scatter plot into different groups.
The cluster generated under Marks can be dragged back into the string-value (dimension) area of the data pane, so it is saved and can be reused in other charts.

Bar chart

A bar chart needs one grouping as rows (the cluster group from before) and one measure (amount) as the value (marks); then click the bar chart in Show Me at the top right.
To count how many people are in a cluster group, add a calculated field:
COUNTD([Customer ID])

We can make a diverging (two-sided) bar chart:
also use the distinct customer ID as a column value to get two bar charts side by side, then select one of them, right-click, choose Edit Axis, and tick Reversed.

The Swap Rows and Columns button in the top toolbar is very handy.

Dashboard

A dashboard gathers charts (sheets) together; you can also add actions to compare the analyses under different categories in real time, which is very powerful.
For example: right-click a category-based bar chart and choose Use as Filter; clicking a category then makes the other charts recompute for that category's data.

The usual workflow: add an action from the dashboard toolbar; choosing Highlight will only highlight the selection rather than hiding the other data.
Source sheet: the category-based sheet used for making the selection.

When inspecting individual values in a chart, you can edit the Tooltip to tidy up each value's label.
Any other data you want shown can be added to Detail on the Marks card and then inserted via Insert in the Tooltip.

The Filters shelf above Marks can use category-type values to select the data under study.

Colouring an ordinary table:
hold Ctrl and drag Measure Values onto Colour (making a copy for colour), then choose Square; the table cells are then shaded by the magnitude of the values.
With several columns of data, Use Separate Legends under Colour lets the shading of each column be read on its own scale.