您的位置:时时app平台注册网站 > 编程知识 > Manipulating Data - 总计数据

Manipulating Data - 总计数据

2019-12-06 10:53

  ddply和aggregate是三个用来组成数据的效率强盛的函数。

使用summaryBy

使用summarizeBy()函数瓦解数据:

library(doBy)

# 给每一组运行长度、均值、标准差等函数
# 每一组依据性别 条件划分
cdata <- summaryBy(change ~ sex   condition, data=data, FUN=c(length,mean,sd))
cdata
#>   sex condition change.length change.mean change.sd
#> 1   F   aspirin             5   -3.420000 0.8642916
#> 2   F   placebo            12   -2.058333 0.5247655
#> 3   M   aspirin             9   -5.411111 1.1307569
#> 4   M   placebo             4   -0.975000 0.7804913

# Rename column change.length to just N
names(cdata)[names(cdata)=="change.length"] <- "N"

# Calculate standard error of the mean
cdata$change.se <- cdata$change.sd / sqrt(cdata$N)
cdata
#>   sex condition  N change.mean change.sd change.se
#> 1   F   aspirin  5   -3.420000 0.8642916 0.3865230
#> 2   F   placebo 12   -2.058333 0.5247655 0.1514867
#> 3   M   aspirin  9   -5.411111 1.1307569 0.3769190
#> 4   M   placebo  4   -0.975000 0.7804913 0.3902456

细心假使您有别的被试内变量,那几个专门的学业误值在比对组别差距时就没用了。参见 ../../Graphs/Plotting means and error bars (ggplot2)卡塔尔获取怎么样绘制被试内变量的抽样误差棒。

   ddply

  上面是ddply函数的雷同用法:

ddply(.data, .variables, .fun = NULL, ..., .progress = "none",.inform = FALSE, .drop = TRUE, .parallel = FALSE, .paropts = NULL)

   例:

# Summarize a dataset by two variables
dfx <- data.frame(
  group = c(rep('A', 8), rep('B', 15), rep('C', 6)),
  sex = sample(c("M", "F"), size = 29, replace = TRUE),
  age = runif(n = 29, min = 18, max = 54)
) 
head(dfx)
 group sex      age
1     A   M 22.44750
2     A   M 52.92616
3     A   F 30.00443
4     A   M 39.56907
5     A   M 18.89180
6     A   F 50.81139
#Note the use of the '.' function to allow
# group and sex to be used without quoting
ddply(dfx, .(group, sex), summarize,mean = round(mean(age), 2),sd = round(sd(age), 2))
  group sex  mean    sd
1     A   F 40.41 14.71
2     A   M 30.35 13.17
3     B   F 34.81 12.76
4     B   M 34.04 13.36
5     C   F 35.09 13.39
6     C   M 28.53  4.57
# An example using a formula for .variables
ddply(baseball[1:100,], ~ year, nrow)
 year V1
1 1871  7
2 1872 13
3 1873 13
4 1874 15
5 1875 17
6 1876 15
7 1877 17
8 1878  3
# Applying two functions; nrow and ncol
ddply(baseball, .(lg), c("nrow", "ncol"))
 lg  nrow ncol
1       65   22
2 AA   171   22
3 AL 10007   22
4 FL    37   22
5 NL 11378   22
6 PL    32   22
7 UA     9   22
# Calculate mean runs batted in for each year
rbi <- ddply(baseball, .(year), summarise,mean_rbi = mean(rbi, na.rm = TRUE))
head(rbi)
  year mean_rbi
1 1871 22.28571
2 1872 20.53846
3 1873 30.92308
4 1874 29.00000
5 1875 31.58824
6 1876 30.13333
# Plot a line chart of the result
plot(mean_rbi ~ year, type = "l", data = rbi)
# make new variable career_year based on the
# start year for each player (id)
base2 <- ddply(baseball, .(id), mutate,career_year = year - min(year)   1)
head(base2)

         id year stint team lg   g  ab   r   h X2b X3b hr rbi sb cs bb so ibb hbp sh sf gidp career_year
1 aaronha01 1954     1  ML1 NL 122 468  58 131  27   6 13  69  2  2 28 39  NA   3  6  4   13           1
2 aaronha01 1955     1  ML1 NL 153 602 105 189  37   9 27 106  3  1 49 61   5   3  7  4   20           2
3 aaronha01 1956     1  ML1 NL 153 609 106 200  34  14 26  92  2  4 37 54   6   2  5  7   21           3
4 aaronha01 1957     1  ML1 NL 151 615 118 198  27   6 44 132  1  1 57 58  15   0  0  3   13           4
5 aaronha01 1958     1  ML1 NL 153 601 109 196  34   4 30  95  4  1 59 49  16   1  0  3   21           5
6 aaronha01 1959     1  ML1 NL 154 629 116 223  46   7 39 123  8  0 51 54  17   4  0  9   19           6

 

使用 ddply

library(plyr)

# 给每一组运行长度、均值、标准差等函数
# 每一组依据性别 条件划分
cdata <- ddply(data, c("sex", "condition"), summarise,
               N    = length(change),
               mean = mean(change),
               sd   = sd(change),
               se   = sd / sqrt(N)
)
cdata
#>   sex condition  N      mean        sd        se
#> 1   F   aspirin  5 -3.420000 0.8642916 0.3865230
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867
#> 3   M   aspirin  9 -5.411111 1.1307569 0.3769190
#> 4   M   placebo  4 -0.975000 0.7804913 0.3902456

  aggregate(x, ...)

  关于aggregate(卡塔尔国函数的使用在《酷路泽语言实战》中P105有简要描述,这里再度说一下。此函数重要有须臾间三种用法:   

    ## Default S3 method:
  aggregate(x, ...)

  ## S3 method for class 'data.frame'
  aggregate(x, by, FUN, ..., simplify = TRUE, drop = TRUE)

  ## S3 method for class 'formula'
  aggregate(formula, data, FUN, ...,subset, na.action = na.omit)

  ## S3 method for class 'ts'
  aggregate(x, nfrequency = 1, FUN = sum, ndeltat = 1,ts.eps = getOption("ts.eps"), ...) 

 

 

例:    

attach(mtcars)
aggdata <-aggregate(mtcars, by=list(cyl,gear), FUN=mean, na.rm=TRUE)
aggdata
 Group.1 Group.2    mpg cyl     disp       hp     drat       wt    qsec  vs   am gear     carb
1       4       3 21.500   4 120.1000  97.0000 3.700000 2.465000 20.0100 1.0 0.00    3 1.000000
2       6       3 19.750   6 241.5000 107.5000 2.920000 3.337500 19.8300 1.0 0.00    3 1.000000
3       8       3 15.050   8 357.6167 194.1667 3.120833 4.104083 17.1425 0.0 0.00    3 3.083333
4       4       4 26.925   4 102.6250  76.0000 4.110000 2.378125 19.6125 1.0 0.75    4 1.500000
5       6       4 19.750   6 163.8000 116.5000 3.910000 3.093750 17.6700 0.5 0.50    4 4.000000
6       4       5 28.200   4 107.7000 102.0000 4.100000 1.826500 16.8000 0.5 1.00    5 2.000000
7       6       5 19.700   6 145.0000 175.0000 3.620000 2.770000 15.5000 0.0 1.00    5 6.000000
8       8       5 15.400   8 326.0000 299.5000 3.880000 3.370000 14.5500 0.0 1.00    5 6.000000

   得到数码框aggdata,个中的Group.1和Group.2的列名可以钦定,只需第二行写成:

aggdata <-aggregate(mtcars, by=list(Group.cyl=cyl, Group.gears=gear),FUN=mean, na.rm=TRUE)

 即可。

  注意:在采纳aggregate(卡塔尔(قطر‎函数的时候, by中的变量必需在三个列表中(纵然独有二个变量) 。 钦赐的函数FUN可为随便的内建或自编函数 。

  别的的某件事例:  

## Compute the averages for the variables in 'state.x77', grouped
## according to the region (Northeast, South, North Central, West) that
## each state belongs to.
aggregate(state.x77, list(Region = state.region), mean)
## Compute the averages according to region and the occurrence of more
## than 130 days of frost.
aggregate(state.x77,
          list(Region = state.region,Cold = state.x77[,"Frost"] > 130),
          mean)
## (Note that no state in 'South' is THAT cold.)
## example with character variables and NAs
testDF <- data.frame(v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9),
                     v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99) )
by1 <- c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12)
by2 <- c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)
aggregate(x = testDF, by = list(by1, by2), FUN = "mean")
# and if you want to treat NAs as a group
fby1 <- factor(by1, exclude = "")
fby2 <- factor(by2, exclude = "")
aggregate(x = testDF, by = list(fby1, fby2), FUN = "mean")
## Formulas, one ~ one, one ~ many, many ~ one, and many ~ many:
aggregate(weight ~ feed, data = chickwts, mean)
aggregate(breaks ~ wool   tension, data = warpbreaks, mean)
aggregate(cbind(Ozone, Temp) ~ Month, data = airquality, mean)
aggregate(cbind(ncases, ncontrols) ~ alcgp   tobgp, data = esoph, sum)
## Dot notation:
aggregate(. ~ Species, data = iris, mean)
aggregate(len ~ ., data = ToothGrowth, mean)
## Often followed by xtabs():
ag <- aggregate(len ~ ., data = ToothGrowth, mean)
xtabs(len ~ ., data = ag)
## Compute the average annual approval ratings for American presidents.
aggregate(presidents, nfrequency = 1, FUN = mean)
## Give the summer less weight.
aggregate(presidents, nfrequency = 1,
          FUN = weighted.mean, w = c(1, 1, 0.5, 1))

方案

有二种办法描述基于一些特定变量的分组数据,然后对每生机勃勃组利用计算函数(像均值、标准差等等)。

  • ddply()函数:它比较简单选拔,但须求载入plyr包。这种办法恐怕正是您要找的(表明很几人用呗,好用呗)。
  • summaryBy()函数:它也正如易于采用,然则它须要载入doBy包。
  • aggregate()函数,它相比较难使用一些但松开于科雷傲中。

若是你有以下数据并想求得每黄金年代组样品大小、均值的改换、规范差以至均值的标准误,而这边的分裂是依靠性别和标准化钦点的:F-placebo, F-aspirin, M-placeboM-aspirin

data <- read.table(header=TRUE, text='
 subject sex condition before after change
       1   F   placebo   10.1   6.9   -3.2
       2   F   placebo    6.3   4.2   -2.1
       3   M   aspirin   12.4   6.3   -6.1
       4   F   placebo    8.1   6.1   -2.0
       5   M   aspirin   15.2   9.9   -5.3
       6   F   aspirin   10.9   7.0   -3.9
       7   F   aspirin   11.6   8.5   -3.1
       8   M   aspirin    9.5   3.0   -6.5
       9   F   placebo   11.5   9.0   -2.5
      10   M   placebo   11.9  11.0   -0.9
      11   F   aspirin   11.4   8.0   -3.4
      12   M   aspirin   10.0   4.4   -5.6
      13   M   aspirin   12.5   5.4   -7.1
      14   M   placebo   10.6  10.6    0.0
      15   M   aspirin    9.1   4.3   -4.8
      16   F   placebo   12.1  10.2   -1.9
      17   F   placebo   11.0   8.8   -2.2
      18   F   placebo   11.9  10.2   -1.7
      19   M   aspirin    9.1   3.6   -5.5
      20   M   placebo   13.5  12.4   -1.1
      21   M   aspirin   12.0   7.5   -4.5
      22   F   placebo    9.1   7.6   -1.5
      23   M   placebo    9.9   8.0   -1.9
      24   F   placebo    7.6   5.2   -2.4
      25   F   placebo   11.8   9.7   -2.1
      26   F   placebo   11.8  10.7   -1.1
      27   F   aspirin   10.1   7.9   -2.2
      28   M   aspirin   11.6   8.3   -3.3
      29   F   aspirin   11.3   6.8   -4.5
      30   F   placebo   10.3   8.3   -2.0
 ')

使用 aggregate

aggregate函数相比较难用,但它放到于ENVISION,所以无需依照其余包。

# 对每个目录 (sex*condition) 中的对象计数
cdata <- aggregate(data["subject"], by=data[c("sex","condition")], FUN=length)
cdata
#>   sex condition subject
#> 1   F   aspirin       5
#> 2   M   aspirin       9
#> 3   F   placebo      12
#> 4   M   placebo       4

# 重命名 "subject" 列为 "N"
names(cdata)[names(cdata)=="subject"] <- "N"
cdata
#>   sex condition  N
#> 1   F   aspirin  5
#> 2   M   aspirin  9
#> 3   F   placebo 12
#> 4   M   placebo  4

# 按性别排序
cdata <- cdata[order(cdata$sex),]
cdata
#>   sex condition  N
#> 1   F   aspirin  5
#> 3   F   placebo 12
#> 2   M   aspirin  9
#> 4   M   placebo  4

# 我们也保留 before 和 after列:
# 得到性别和条件下的平均影响大小
# Get the average effect size by sex and condition
cdata.means <- aggregate(data[c("before","after","change")], 
                         by = data[c("sex","condition")], FUN=mean)
cdata.means
#>   sex condition   before     after    change
#> 1   F   aspirin 11.06000  7.640000 -3.420000
#> 2   M   aspirin 11.26667  5.855556 -5.411111
#> 3   F   placebo 10.13333  8.075000 -2.058333
#> 4   M   placebo 11.47500 10.500000 -0.975000

# 融合数据框
cdata <- merge(cdata, cdata.means)
cdata
#>   sex condition  N   before     after    change
#> 1   F   aspirin  5 11.06000  7.640000 -3.420000
#> 2   F   placebo 12 10.13333  8.075000 -2.058333
#> 3   M   aspirin  9 11.26667  5.855556 -5.411111
#> 4   M   placebo  4 11.47500 10.500000 -0.975000

# 得到标准差
cdata.sd <- aggregate(data["change"],
                      by = data[c("sex","condition")], FUN=sd)
# 重命名列
names(cdata.sd)[names(cdata.sd)=="change"] <- "change.sd"
cdata.sd
#>   sex condition change.sd
#> 1   F   aspirin 0.8642916
#> 2   M   aspirin 1.1307569
#> 3   F   placebo 0.5247655
#> 4   M   placebo 0.7804913

# 融合
cdata <- merge(cdata, cdata.sd)
cdata
#>   sex condition  N   before     after    change change.sd
#> 1   F   aspirin  5 11.06000  7.640000 -3.420000 0.8642916
#> 2   F   placebo 12 10.13333  8.075000 -2.058333 0.5247655
#> 3   M   aspirin  9 11.26667  5.855556 -5.411111 1.1307569
#> 4   M   placebo  4 11.47500 10.500000 -0.975000 0.7804913

# 计算标准误
cdata$change.se <- cdata$change.sd / sqrt(cdata$N)
cdata
#>   sex condition  N   before     after    change change.sd change.se
#> 1   F   aspirin  5 11.06000  7.640000 -3.420000 0.8642916 0.3865230
#> 2   F   placebo 12 10.13333  8.075000 -2.058333 0.5247655 0.1514867
#> 3   M   aspirin  9 11.26667  5.855556 -5.411111 1.1307569 0.3769190
#> 4   M   placebo  4 11.47500 10.500000 -0.975000 0.7804913 0.3902456

意气风发旦您有NA值想要跳过,设置 na.rm=TRUE:

cdata.means <- aggregate(data[c("before","after","change")], 
                         by = data[c("sex","condition")],
                         FUN=mean, na.rm=TRUE)
cdata.means
#>   sex condition   before     after    change
#> 1   F   aspirin 11.06000  7.640000 -3.420000
#> 2   M   aspirin 11.26667  5.855556 -5.411111
#> 3   F   placebo 10.13333  8.075000 -2.058333
#> 4   M   placebo 11.47500 10.500000 -0.975000

最初的小说链接:http://www.cookbook-r.com/Manipulating_data/Summarizing_data/

问题

你想要根据界别总括你的数码(均值、标准差等等)。

用零填满空组合

奇迹总括的数据框中存在因子的空组合 - 那意味是,因子组合也许存在,但土生土养数据框里又从未实际现身。它在自动填满有NA值的多少框时有用。要成功那或多或少,当调用ddplysummarySE时设置.drop=FALSE。(这里小编翻译的不是很满足,大家能够查阅原作)

例子:

# 首先移除所有 Male Placebo 条目
dataSub <- subset(data, !(sex=="M" & condition=="placebo"))

# 如果我们总结数据,在本来有 Male Placebo 的地方会存在空行
# 因为这个组合已经被我们删除了
summarySE(dataSub, measurevar="change", groupvars=c("sex", "condition"))
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767

# 设置 .drop=FALSE 指定不要扔掉这个组合
summarySE(dataSub, measurevar="change", groupvars=c("sex", "condition"), .drop=FALSE)
#> Warning in qt(conf.interval/2   0.5, datac$N - 1): NaNs produced
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
#> 4   M   placebo  0       NaN        NA        NA        NA

用零填满空组合

不常计算的数据框中存在因子的空组合 - 那意思是,因子组合可能存在,但原有数据框里又从未实际现身。它在机关填满有NA值的多寡框时有用。

以此例子将会用0填满缺点和失误的组合:

fillMissingCombs <- function(df, factors, measures) {

    # Make a list of the combinations of factor levels
    levelList <- list()
    for (f in factors) {  levelList[[f]] <- levels(df[,f])  }

    fullFactors <- expand.grid(levelList)

    dfFull <- merge(fullFactors, df, all.x=TRUE)

    # Wherever there is an NA in the measure vars, replace with 0
    for (m in measures) {
      dfFull[is.na(dfFull[,m]), m] <- 0
    }

    return(dfFull)
}

应用例子:

# First remove some all Male Placebo entries from the data
dataSub <- subset(data, !(sex=="M" & condition=="placebo"))

# If we summarize the data, there will be a missing row for Male Placebo,
# since there were no cases with this combination.
cdataSub <- summarySE(dataSub, measurevar="change", groupvars=c("sex", "condition"))
cdataSub
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767

# This will fill in the missing combinations with zeros
fillMissingCombs(cdataSub, factors=c("sex","condition"), measures=c("N","change","sd","se","ci"))
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
#> 4   M   placebo  0  0.000000 0.0000000 0.0000000 0.0000000

自行总计函数

不像我们刚刚手动地钦点想要的值然后计算规范误,这一个函数能够活动管理全体的细节。它能够干以下的事体:

  • 查找均值、标准差和计数
  • 搜求均值的正规化误(重申,借使您管理的是被试内变量那也许不是你想要的,,参见../../Graphs/Plotting means and error bars (ggplot2)卡塔尔获取怎么绘制组内变量的固有误差棒新闻。关于被试设计难点,能够点击详细询问)
  • 寻觅95%的置信区间(也足以钦定别的值)
  • 重命令结果数据集的变量名,那样更平价后续处理

要选用的话,把函数放你的代码中然后像上边相仿调用它。

## Summarizes data.
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
    library(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
      .fun = function(xx, col) {
        c(N    = length2(xx[[col]], na.rm=na.rm),
          mean = mean   (xx[[col]], na.rm=na.rm),
          sd   = sd     (xx[[col]], na.rm=na.rm)
        )
      },
      measurevar
    )

    # Rename the "mean" column    
    datac <- rename(datac, c("mean" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2   .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

比如使用它(用95%的置信区间)。与后面手动总结这几个手续相反,summarySE函数一步解决:

summarySE(data, measurevar="change", groupvars=c("sex", "condition"))
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
#> 4   M   placebo  4 -0.975000 0.7804913 0.3902456 1.2419358

# With a data set with NA's, use na.rm=TRUE
summarySE(dataNA, measurevar="change", groupvars=c("sex", "condition"), na.rm=TRUE)
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  4 -3.425000 0.9979145 0.4989572 1.5879046
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  7 -5.142857 1.0674848 0.4034713 0.9872588
#> 4   M   placebo  3 -1.300000 0.5291503 0.3055050 1.3144821

管理缺失值

要是数量中存在NA值,你供给加上na.rm=TRUE分选。平时你能够在summaryBy()函数中设置,但length()函数识别不了那几个选项。风流罗曼蒂克种缓和措施是依据length()函数定义二个新的取长度函数去管理NA值。

# 新版的length函数可以处理NA值,如果na.rm=T,则不对NA计数
length2 <- function (x, na.rm=FALSE) {
    if (na.rm) sum(!is.na(x))
    else       length(x)
}

# 给数据添加一些NA值
dataNA <- data
dataNA$change[11:14] <- NA

cdataNA <- summaryBy(change ~ sex   condition, data=dataNA,
                     FUN=c(length2, mean, sd), na.rm=TRUE)
cdataNA
#>   sex condition change.length2 change.mean change.sd
#> 1   F   aspirin              4   -3.425000 0.9979145
#> 2   F   placebo             12   -2.058333 0.5247655
#> 3   M   aspirin              7   -5.142857 1.0674848
#> 4   M   placebo              3   -1.300000 0.5291503

# Now, do the same as before

拍卖缺失值

借使数额中留存NA值,供给给各个函数加多na.rm=TRUE标记去除缺点和失误值。因为函数length()没有na.rm分选,所以能够运用sum(!is.na(...))的方式对非缺点和失误值进行计数。

# 给数据加些NA值
dataNA <- data
dataNA$change[11:14] <- NA

cdata <- ddply(dataNA, c("sex", "condition"), summarise,
               N    = sum(!is.na(change)),
               mean = mean(change, na.rm=TRUE),
               sd   = sd(change, na.rm=TRUE),
               se   = sd / sqrt(N)
)
cdata
#>   sex condition  N      mean        sd        se
#> 1   F   aspirin  4 -3.425000 0.9979145 0.4989572
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867
#> 3   M   aspirin  7 -5.142857 1.0674848 0.4034713
#> 4   M   placebo  3 -1.300000 0.5291503 0.3055050

电动总结函数

(注意这里的机关计算函数与事前的分化,它是通过summaryBy实现的)

不像我们刚刚手动地钦命想要的值然后总结标准误,这一个函数能够自行管理全体的内部原因。它能够干以下的作业:

  • 检索均值、标准差和计数
  • 找出均值的专门的学业误(强调,若是你管理的是被试内变量那大概不是您想要的,,参见../../Graphs/Plotting means and error bars (ggplot2)卡塔尔国获取怎么绘制组内变量的模型误差棒音讯。关于被试设计难点,能够点击详细摸底)
  • 搜寻95%的置信区间(也能够钦定别的值)
  • 重命令结果数据集的变量名,那样更有协助后续管理

要利用以来,把函数放你的代码中然后像上面同样调用它。

## Summarizes data.
## Gives count, mean, standard deviation, standard error of the mean, and confidence 
## interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE, conf.interval=.95) {
    library(doBy)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # Collapse the data
    formula <- as.formula(paste(measurevar, paste(groupvars, collapse="   "), sep=" ~ "))
    datac <- summaryBy(formula, data=data, FUN=c(length2,mean,sd), na.rm=na.rm)

    # Rename columns
    names(datac)[ names(datac) == paste(measurevar, ".mean",    sep="") ] <- measurevar
    names(datac)[ names(datac) == paste(measurevar, ".sd",      sep="") ] <- "sd"
    names(datac)[ names(datac) == paste(measurevar, ".length2", sep="") ] <- "N"

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2   .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

举个例证使用它(用95%的置信区间)。与事情未发生前手动计算那些步骤相反,summarySE函数一步解决:

summarySE(data, measurevar="change", groupvars=c("sex","condition"))
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
#> 4   M   placebo  4 -0.975000 0.7804913 0.3902456 1.2419358

# With a data set with NA's, use na.rm=TRUE
summarySE(dataNA, measurevar="change", groupvars=c("sex","condition"), na.rm=TRUE)
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  4 -3.425000 0.9979145 0.4989572 1.5879046
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  7 -5.142857 1.0674848 0.4034713 0.9872588
#> 4   M   placebo  3 -1.300000 0.5291503 0.3055050 1.3144821

本文由时时app平台注册网站发布于编程知识,转载请注明出处:Manipulating Data - 总计数据

关键词: