Data set obtained from Kutner et al. (2005).
This data set consists of a random sample of 113 hospitals selected from the original 338 hospitals surveyed.
Each line of the data set has an identification number and provides information on 11 other variables for a single hospital. The data presented here are for the 1975-76 study period.
The 12 variables are:
| Variable Number | Variable Name | Description |
|---|---|---|
| 1 | Identification number | 1-113 |
| 2 | Length of stay | Average length of stay of all patients in hospital (in days) |
| 3 | Age | Average age of patients (in years) |
| 4 | Infection risk | Average estimated probability of acquiring infection in hospital (in percent) |
| 5 | Routine culturing ratio | Ratio of number of cultures performed to number of patients without signs or symptoms of hospital-acquired infection, times 100 |
| 6 | Routine chest X-ray ratio | Ratio of number of X-rays performed to number of patients without signs or symptoms of pneumonia, times 100 |
| 7 | Number of beds | Average number of beds in hospital during study period |
| 8 | Medical school affiliation | 1 =Yes, 2=No |
| 9 | Region | Geographic region, where: 1 =NE, 2=NC, 3=S, 4=W |
| 10 | Average daily census | Average number of patients in hospital per day during study period |
| 11 | Number of nurses | Average number of full-time equivalent registered and licensed practical nurses during study period (number full-time plus one half the number part time) |
| 12 | Available facilities and services | Percent of 35 potential facilities and services that are provided by the hospital |
Data are available from https://web.tecnico.ulisboa.pt/giovani.silva/ec/LabClasses_data1.txt
#setwd("C:/Privado/Disciplinas/EC/Exercicios") # set your windows working directory
#setwd("~/Privado/Disciplinas/EC/Exercicios") # set your linux working directory
mydata <- read.table("LabClasses_data1.txt", header=FALSE)
#mydata <- read.table("LabClasses_data1.txt", header=TRUE, sep=",", row.names="id")
dim(mydata)
## [1] 113 12
head(mydata)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 1 7.13 55.7 4.1 9.0 39.6 279 2 4 207 241 60
## 2 2 8.82 58.2 1.6 3.8 51.7 80 2 2 51 52 40
## 3 3 8.34 56.9 2.7 8.1 74.0 107 2 3 82 54 20
## 4 4 8.95 53.7 5.6 18.9 122.8 147 2 4 53 148 40
## 5 5 11.20 56.5 5.7 34.5 88.9 180 2 1 134 151 40
## 6 6 9.76 50.9 5.1 21.9 97.0 150 2 2 147 106 40
names(mydata)[1] <- "ID"
names(mydata)
## [1] "ID" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11" "V12"
mydata$V9 <- factor(mydata$V9, levels = c(1,2,3,4), labels = c("NE", "NC", "S","W"))
head(mydata)
## ID V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 1 7.13 55.7 4.1 9.0 39.6 279 2 W 207 241 60
## 2 2 8.82 58.2 1.6 3.8 51.7 80 2 NC 51 52 40
## 3 3 8.34 56.9 2.7 8.1 74.0 107 2 S 82 54 20
## 4 4 8.95 53.7 5.6 18.9 122.8 147 2 W 53 148 40
## 5 5 11.20 56.5 5.7 34.5 88.9 180 2 NE 134 151 40
## 6 6 9.76 50.9 5.1 21.9 97.0 150 2 NC 147 106 40
mydata$V2[mydata$V2==99] <- NA
newdata1 = mydata[!complete.cases(mydata),]
newdata1 <- na.omit(mydata)
summary(mydata)
## ID V2 V3 V4 V5
## Min. : 1 Min. : 6.700 Min. :38.80 Min. :1.300 Min. : 1.60
## 1st Qu.: 29 1st Qu.: 8.340 1st Qu.:50.90 1st Qu.:3.700 1st Qu.: 8.40
## Median : 57 Median : 9.420 Median :53.20 Median :4.400 Median :14.10
## Mean : 57 Mean : 9.648 Mean :53.23 Mean :4.355 Mean :15.79
## 3rd Qu.: 85 3rd Qu.:10.470 3rd Qu.:56.20 3rd Qu.:5.200 3rd Qu.:20.30
## Max. :113 Max. :19.560 Max. :65.90 Max. :7.800 Max. :60.50
## V6 V7 V8 V9 V10
## Min. : 39.60 Min. : 29.0 Min. :1.00 NE:28 Min. : 20.0
## 1st Qu.: 69.50 1st Qu.:106.0 1st Qu.:2.00 NC:32 1st Qu.: 68.0
## Median : 82.30 Median :186.0 Median :2.00 S :37 Median :143.0
## Mean : 81.63 Mean :252.2 Mean :1.85 W :16 Mean :191.4
## 3rd Qu.: 94.10 3rd Qu.:312.0 3rd Qu.:2.00 3rd Qu.:252.0
## Max. :133.50 Max. :835.0 Max. :2.00 Max. :791.0
## V11 V12
## Min. : 14.0 Min. : 5.70
## 1st Qu.: 66.0 1st Qu.:31.40
## Median :132.0 Median :42.90
## Mean :173.2 Mean :43.16
## 3rd Qu.:218.0 3rd Qu.:54.30
## Max. :656.0 Max. :80.00
attach(mydata)
mean(ID, na.rm=TRUE)
## [1] 57
sapply(mydata, mean, na.rm=TRUE)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## ID V2 V3 V4 V5 V6 V7
## 57.000000 9.648319 53.231858 4.354867 15.792920 81.628319 252.168142
## V8 V9 V10 V11 V12
## 1.849558 NA 191.371681 173.247788 43.159292
library(psych)
describe.by(mydata, V9)
## Warning in describe.by(mydata, V9): describe.by is deprecated. Please use the
## describeBy function
##
## Descriptive statistics by group
## group: NE
## vars n mean sd median trimmed mad min max range skew
## ID 1 28 52.00 32.36 49.00 51.04 34.84 5.00 112.00 107.00 0.33
## V2 2 28 11.09 2.67 10.72 10.70 1.92 8.03 19.56 11.53 1.59
## V3 3 28 54.26 3.97 54.10 54.22 3.11 45.20 65.90 20.70 0.31
## V4 4 28 4.86 1.27 4.85 4.85 1.33 2.50 7.70 5.20 0.10
## V5 5 28 21.20 9.31 20.00 20.64 9.93 7.70 46.00 38.30 0.56
## V6 6 28 91.05 20.26 91.20 90.91 15.57 56.00 133.50 77.50 -0.11
## V7 7 28 259.32 184.15 222.00 230.42 128.24 52.00 835.00 783.00 1.73
## V8 8 28 1.82 0.39 2.00 1.88 0.00 1.00 2.00 1.00 -1.59
## V9 9 28 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 NaN
## V10 10 28 210.36 165.72 172.50 184.67 108.97 37.00 791.00 754.00 1.84
## V11 11 28 190.61 145.83 161.50 169.75 81.54 35.00 656.00 621.00 1.60
## V12 12 28 47.14 12.72 45.70 46.42 12.75 28.60 80.00 51.40 0.51
## kurtosis se
## ID -1.24 6.12
## V2 2.48 0.50
## V3 1.34 0.75
## V4 -0.70 0.24
## V5 -0.08 1.76
## V6 -0.66 3.83
## V7 2.81 34.80
## V8 0.55 0.07
## V9 NaN 0.00
## V10 3.66 31.32
## V11 2.17 27.56
## V12 -0.32 2.40
## ------------------------------------------------------------
## group: NC
## vars n mean sd median trimmed mad min max range skew
## ID 1 32 55.00 30.35 56.50 55.08 35.58 2.00 109.00 107.00 -0.04
## V2 2 32 9.68 1.19 9.80 9.69 1.34 7.39 12.07 4.68 -0.08
## V3 3 32 51.82 4.45 51.85 52.13 3.48 38.80 59.60 20.80 -0.73
## V4 4 32 4.39 1.34 4.40 4.45 1.11 1.30 7.80 6.50 -0.21
## V5 5 32 15.94 12.93 13.45 13.55 7.04 2.20 60.50 58.30 1.95
## V6 6 32 83.17 18.83 85.40 83.78 15.64 42.60 116.90 74.30 -0.32
## V7 7 32 279.72 206.17 195.50 258.88 150.48 56.00 752.00 696.00 0.83
## V8 8 32 1.78 0.42 2.00 1.85 0.00 1.00 2.00 1.00 -1.30
## V9 9 32 2.00 0.00 2.00 2.00 0.00 2.00 2.00 0.00 NaN
## V10 10 32 208.94 158.99 155.00 191.42 114.16 38.00 595.00 557.00 0.91
## V11 11 32 185.50 133.87 150.50 172.96 99.33 14.00 469.00 455.00 0.84
## V12 12 32 45.71 15.43 47.15 46.15 14.90 5.70 71.40 65.70 -0.29
## kurtosis se
## ID -1.14 5.37
## V2 -0.79 0.21
## V3 0.73 0.79
## V4 0.43 0.24
## V5 3.78 2.28
## V6 -0.49 3.33
## V7 -0.80 36.45
## V8 -0.32 0.07
## V9 NaN 0.00
## V10 -0.46 28.11
## V11 -0.61 23.66
## V12 -0.39 2.73
## ------------------------------------------------------------
## group: S
## vars n mean sd median trimmed mad min max range skew
## ID 1 37 61.65 34.01 65.0 62.32 38.55 3.00 113.00 110.00 -0.15
## V2 2 37 9.19 1.22 9.0 9.16 1.25 7.08 11.46 4.38 0.33
## V3 3 37 53.62 4.35 52.4 53.48 3.26 45.00 63.90 18.90 0.43
## V4 4 37 3.93 1.46 4.2 3.91 1.63 1.30 7.60 6.30 0.08
## V5 5 37 12.89 8.03 12.0 12.29 7.26 1.60 42.00 40.40 1.22
## V6 6 37 75.09 15.00 78.9 75.67 13.94 40.40 97.90 57.50 -0.38
## V7 7 37 250.51 189.62 182.0 229.26 140.85 29.00 833.00 804.00 1.10
## V8 8 37 1.92 0.28 2.0 2.00 0.00 1.00 2.00 1.00 -2.95
## V9 9 37 3.00 0.00 3.0 3.00 0.00 3.00 3.00 0.00 NaN
## V10 10 37 189.65 142.72 130.0 175.19 105.26 20.00 547.00 527.00 0.82
## V11 11 37 160.59 135.94 118.0 143.84 108.23 19.00 519.00 500.00 1.05
## V12 12 37 39.54 15.24 40.0 39.18 16.90 11.40 77.10 65.70 0.17
## kurtosis se
## ID -1.33 5.59
## V2 -0.87 0.20
## V3 -0.24 0.72
## V4 -0.45 0.24
## V5 2.45 1.32
## V6 -0.79 2.47
## V7 0.51 31.17
## V8 6.87 0.05
## V9 NaN 0.00
## V10 -0.57 23.46
## V11 0.16 22.35
## V12 -0.54 2.51
## ------------------------------------------------------------
## group: W
## vars n mean sd median trimmed mad min max range skew
## ID 1 16 59.00 36.57 60.50 59.43 51.15 1.0 111.00 110.00 -0.20
## V2 2 16 8.11 1.00 7.81 8.07 1.00 6.7 10.16 3.46 0.56
## V3 3 16 53.37 5.22 53.95 53.41 3.56 42.0 64.10 22.10 -0.18
## V4 4 16 4.38 0.88 4.45 4.42 0.59 2.6 5.60 3.00 -0.53
## V5 5 16 12.73 6.39 12.25 12.51 6.60 2.6 26.00 23.40 0.45
## V6 6 16 77.16 22.15 77.80 76.59 23.35 39.6 122.80 83.20 0.17
## V7 7 16 188.38 190.86 118.50 151.36 67.46 64.0 831.00 767.00 2.33
## V8 8 16 1.88 0.34 2.00 1.93 0.00 1.0 2.00 1.00 -2.06
## V9 9 16 4.00 0.00 4.00 4.00 0.00 4.0 4.00 0.00 NaN
## V10 10 16 127.00 142.15 69.50 101.00 37.06 37.0 581.00 544.00 2.04
## V11 11 16 147.62 152.22 83.00 121.29 50.41 35.0 629.00 594.00 1.97
## V12 12 16 39.46 17.15 34.25 38.57 16.83 17.1 74.30 57.20 0.53
## kurtosis se
## ID -1.48 9.14
## V2 -0.91 0.25
## V3 -0.12 1.31
## V4 -0.47 0.22
## V5 -0.72 1.60
## V6 -0.79 5.54
## V7 5.11 47.72
## V8 2.40 0.09
## V9 NaN 0.00
## V10 3.65 35.54
## V11 3.43 38.05
## V12 -1.14 4.29
# 2-Way Frequency Table
mytable <- table(V8,V9)
mytable
## V9
## V8 NE NC S W
## 1 5 7 3 2
## 2 23 25 34 14
margin.table(mytable, 1) # V1 frequencies (summed over V9)
## V8
## 1 2
## 17 96
prop.table(mytable) # cell percentages
## V9
## V8 NE NC S W
## 1 0.04424779 0.06194690 0.02654867 0.01769912
## 2 0.20353982 0.22123894 0.30088496 0.12389381
prop.table(mytable, 1) # row percentages
## V9
## V8 NE NC S W
## 1 0.2941176 0.4117647 0.1764706 0.1176471
## 2 0.2395833 0.2604167 0.3541667 0.1458333
prop.table(mytable, 2) # column percentages
## V9
## V8 NE NC S W
## 1 0.17857143 0.21875000 0.08108108 0.12500000
## 2 0.82142857 0.78125000 0.91891892 0.87500000
newdata2 <- mydata[c(2:7,10:12)]
cor(newdata2, use="complete.obs", method="kendall")
## V2 V3 V4 V5 V6 V7
## V2 1.00000000 0.07302776 0.40008264 0.2339432 0.21842355 0.34431426
## V3 0.07302776 1.00000000 -0.01122091 -0.1253279 -0.05269865 -0.09212224
## V4 0.40008264 -0.01122091 1.00000000 0.4009442 0.26801135 0.30384270
## V5 0.23394317 -0.12532795 0.40094421 1.0000000 0.33050923 0.13089296
## V6 0.21842355 -0.05269865 0.26801135 0.3305092 1.00000000 0.05257325
## V7 0.34431426 -0.09212224 0.30384270 0.1308930 0.05257325 1.00000000
## V10 0.36844614 -0.09531398 0.30724965 0.1377288 0.05511563 0.89413630
## V11 0.32119104 -0.10292278 0.34334225 0.1915855 0.07885987 0.77515449
## V12 0.26863379 -0.03105364 0.28057481 0.1476115 0.05089780 0.68074341
## V10 V11 V12
## V2 0.36844614 0.32119104 0.26863379
## V3 -0.09531398 -0.10292278 -0.03105364
## V4 0.30724965 0.34334225 0.28057481
## V5 0.13772882 0.19158545 0.14761153
## V6 0.05511563 0.07885987 0.05089780
## V7 0.89413630 0.77515449 0.68074341
## V10 1.00000000 0.77511887 0.65148888
## V11 0.77511887 1.00000000 0.67733726
## V12 0.65148888 0.67733726 1.00000000
cov(newdata2, use="complete.obs")
## V2 V3 V4 V5 V6 V7
## V2 3.653664 1.611089760 1.367262721 6.390979 14.156843 150.85939
## V3 1.611090 19.905940265 0.006539981 -10.312897 -1.628946 -50.61076
## V4 1.367263 0.006539981 1.798034134 7.673785 11.772361 93.03087
## V5 6.390979 -10.312897440 7.673784766 104.749235 84.220292 275.77352
## V6 14.156843 -1.628945954 11.772360936 84.220292 374.957762 171.09966
## V7 150.859392 -50.610761694 93.030870733 275.773522 171.099660 37188.30183
## V10 139.277148 -37.576232617 78.638353350 224.955333 187.317059 29087.96373
## V11 90.605599 -51.537428887 73.572890329 283.499984 208.675063 24587.06511
## V12 10.330431 -2.743423673 8.410021334 28.802031 32.945538 2329.04887
## V10 V11 V12
## V2 139.27715 90.60560 10.330431
## V3 -37.57623 -51.53743 -2.743424
## V4 78.63835 73.57289 8.410021
## V5 224.95533 283.49998 28.802031
## V6 187.31706 208.67506 32.945538
## V7 29087.96373 24587.06511 2329.048870
## V10 23642.00348 19441.14815 1818.550087
## V11 19441.14815 19394.84877 1658.644998
## V12 1818.55009 1658.64500 231.066185
hist(mydata$V4, col="blue")
density(mydata$V4)
##
## Call:
## density.default(x = mydata$V4)
##
## Data: mydata$V4 (113 obs.); Bandwidth 'bw' = 0.3914
##
## x y
## Min. :0.1258 Min. :0.000165
## 1st Qu.:2.3379 1st Qu.:0.022280
## Median :4.5500 Median :0.075605
## Mean :4.5500 Mean :0.112789
## 3rd Qu.:6.7621 3rd Qu.:0.175519
## Max. :8.9742 Max. :0.355245
library(sm)
## Warning: package 'sm' was built under R version 4.4.3
## Package 'sm', version 2.2-6.0: type help(sm) for summary information
sm.density.compare(V4, V9, xlab="Infection risk")
title(main="IR Distribution by Regions")
colfill<-c(1:length(levels(V9)))
legend(8,0.4, levels(V9), fill=colfill)
barplot(table(V9), main="Region distribution", xlab="Regions")
barplot(table(V9), main="Region distribution", horiz=TRUE, names.arg=c("NE", "NC", "S", "W"))
boxplot(V4 ~ V9,data=mtcars, main="hospital-acquired infection", xlab="Regions", ylab="Infection risk")
plot(V3, V4, main=" Scatterplot ", xlab="Age ", ylab="Infection risk ", pch=19)
pairs(~V2+V3+V4,data=newdata2,
main="Simple Scatterplot Matrix")
Kutner, M., Nachtsheim, C., Neter, J., Li, W. (2005). Applied Linear Statistical Models, 5th Edition. New York: McGraw-Hill/Irwin.
Kabacoff, R.I. (2017). Quick-R by DataCamp website.