Min = min(x, na.rm = T), Range = max(x,na.rm = T) - min(x,na.rm = T), Mean = mean(x, na.rm = T), SD = sd(x, na.rm = T), CV = sd(x, na.rm = T)/mean(x, na.rm = T) * 100) } sm = T), Min = min(x, na.rm = T), Range = max(x,na.rm = T) - min(x,na.rm = T), Mean = mean (x, na.rm = T), SD = sd(x, na.rm = T), CV = sd(x, na.rm = T)/mean(x, na.rm = T) * 100) } CV = sd(values,na.rm = T)/mean(values,na.rm=T)) `summarise()` ungrouping output (override with `.groups
FALSE当做1、0 #计算缺失值个数 sum(is.na(an)) #单数列,sum一下可以直接计算“TRUE”的数值和 colSums(is.na(an),na.rm = T) #多维数列,按列,na.rm为是否需要忽略缺失值,na.rm=T表示忽略,删除 rowSums(is.na(an),na.rm = T) #多维数列,按行,na.rm为是否需要忽略缺失值 ,na.rm=T表示忽略,删除 #数据框中的缺失值操作 #数据框中的缺失值操作 y <- an[is.na(an)] #选中缺失值 y<- an[is.na(an)=="TRUE
# An answer on Stack Overflow was an eye-opener; below is a general-purpose
# function for computing the geometric mean.
#
# Geometric mean of a numeric vector.
#
# Args:
#   x: Numeric vector.
#   na.rm: If TRUE (the default), missing values are ignored. If FALSE, any
#     missing value makes the result NA.
#   zero.propagate: If TRUE, a zero anywhere in `x` makes the result 0. If
#     FALSE (the default), zeros are left out of the log-sum but still count
#     toward the number of observations in the denominator.
#
# Returns:
#   A single numeric value; NaN when `x` contains a negative value.
gm_mean <- function(x, na.rm = TRUE, zero.propagate = FALSE) {
  # A geometric mean is undefined for negative inputs.
  if (any(x < 0, na.rm = TRUE)) {
    return(NaN)
  }

  if (zero.propagate) {
    if (any(x == 0, na.rm = TRUE)) {
      return(0)
    }
    return(exp(mean(log(x), na.rm = na.rm)))
  }

  # When NAs are dropped from the log-sum they must also be dropped from the
  # denominator; dividing by length(x) would bias the result toward 1.
  n_obs <- if (na.rm) sum(!is.na(x)) else length(x)
  exp(sum(log(x[x > 0]), na.rm = na.rm) / n_obs)
}
# The last argument (zero.propagate) controls how zeros in the input are
# treated.
is.na(change)), mean = mean(change, na.rm=TRUE), sd = sd(change, na.rm ==T, don't count them length2 <- function (x, na.rm=FALSE) { if (na.rm) sum(! =na.rm), mean = mean (xx[[col]], na.rm=na.rm), sd = sd (xx[[col]], na.rm= # 新版的length函数可以处理NA值,如果na.rm=T,则不对NA计数 length2 <- function (x, na.rm=FALSE) { if (na.rm) sum(! ==T, don't count them length2 <- function (x, na.rm=FALSE) { if (na.rm) sum(!
编写函数 huizong = function (dd) { func <- function(x) { c(Max = max(x, na.rm = T), Min = min (x, na.rm = T), Range = max(x,na.rm = T) - min(x,na.rm = T), Mean = mean(x, na.rm = T), SD = sd(x, na.rm = T), CV = sd(x, na.rm = T)/mean(x, na.rm = T) * 100) } sm <- as.data.frame = T), Min = min(x, na.rm = T), Range = max(x,na.rm = T) - min(x,na.rm = T), Mean = mean(x, na.rm = T), SD = sd(x, na.rm = T), CV = sd(x, na.rm = T)/mean(x, na.rm = T) *
., na.rm = FALSE) min(..., na.rm = FALSE) pmax(..., na.rm = FALSE) pmin(..., na.rm = FALSE) pmax.int (..., na.rm = FALSE) pmin.int(..., na.rm = FALSE) 描述和用法说的都很清楚了 如果还不是很清楚,就可以运行实例 2.example > example
mydata <- data.frame(a,b,c) # 利用以创建的变量构建数据框 ##基本统计量计算 mean(a) #由于有NA值,直接计算平均值返回的是NA # [1] NA mean(a,na.rm = T) #去除NA值后再计算平均值 #[1] 2.75 sum(a,na.rm = T) #去除NA值后再求和 #[1] 11 sd(a,na.rm = T) #去除NA值后再计算标准差 #[1] 1.707825 var(a,na.rm = T) #去除NA值后再计算方差 #[1] 2.916667 sqrt(var(a,na.rm = T)) #方差取平方根后就是标准差,计算结果和sd()一致 #[1] 1.707825 min(a,na.rm = T) #去除NA值后再计算最小值 #[1] 1 max(a,na.rm = T) #去除NA值后再计算最大值 #[1] 5 median(a,na.rm = T) #去除NA值后再计算中位数 # [1] 2.5 quantile(a,na.rm = T) #去除NA值后再计算分位数(后续统计部分会有讲解) # 0% 25% 50% 75% 100%
q025, y = q05, ymax = q075), data = summarise( group_by(little.mcar.p, n), q025 = quantile(p, .025, na.rm = TRUE), q05 = quantile(p, .05, na.rm = TRUE), q075 = quantile(p, .075, na.rm = TRUE) )) + geom_hline = q95, ymax = q975), data = summarise( group_by(little.mcar.p.mar, n), q925 = quantile(p, .925, na.rm = TRUE), q95 = quantile(p, .95, na.rm = TRUE), q975 = quantile(p, .975, na.rm = TRUE) ), linetype
remove_outliers <- function(x, na.rm = TRUE, ...) { qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm H <- 1.5 * IQR(x, na.rm = na.rm) y <- x y[x < (qnt[1] - H)] <- NA y[x > (qnt[2] + H)] <- NA y
q025, y = q05, ymax = q075), data = summarise( group_by(little.mcar.p, n), q025 = quantile(p, .025, na.rm = TRUE), q05 = quantile(p, .05, na.rm = TRUE), q075 = quantile(p, .075, na.rm = TRUE) )) + geom_hline = q95, ymax = q975), data = summarise( group_by(little.mcar.p.mar, n), q925 = quantile(p, .925, na.rm = TRUE), q95 = quantile(p, .95, na.rm = TRUE), q975 = quantile(p, .975, na.rm = TRUE) ), linetype
平均值通过将所有数值求和再除以数据序列中值的个数来计算。R 中使用 mean() 函数计算平均值,语法如下: mean(x, trim = 0, na.rm = FALSE, ...) 参数描述如下: x – 输入向量。 trim – 排序后从向量两端各去掉的观测值比例(0 到 0.5)。 na.rm – 是否在计算前从输入向量中删除缺失值。 如果向量中存在缺失值,mean() 会返回 NA;若要在计算时忽略缺失值,可以使用 na.rm = TRUE,即先删除 NA 值再计算。 好啦,来综合看下实例: 输出结果为: 数据序列中的中间值被称为中位数,R 中使用 median() 函数计算中位数,语法如下: median(x, na.rm = FALSE) 参数描述如下: x – 输入向量。 na.rm – 是否在计算前从输入向量中删除缺失值。 众数是指一组数据中出现次数最多的值;与平均值和中位数不同,众数既可用于数值数据,也可用于字符数据。
library(dplyr) starwars %>% group_by(gender) %>% summarise(mass_maximum = max(mass, na.rm = TRUE function(data, var, by) { data %>% group_by({{ by }}) %>% summarise(maximum = max({{ var }}, na.rm by }}) %>% summarise(...) } starwars %>% summarise_by( average = mean(height, na.rm = TRUE), maximum = max(height, na.rm = TRUE), by = gender ) #> # A tibble: 5 x 3 var, by) { data %>% group_by(.data[[by]]) %>% summarise(maximum = max(.data[[var]], na.rm
", "#3bceac") 数据清洗 df_mean <- df %>% group_by(type) %>% summarise(n_artists = sum(artists_n, na.rm ) df_base <- artists %>% group_by(type, state) %>% summarise( n_artists = sum(artists_n, na.rm = TRUE), lq = sum(log(location_quotient)*artists_n/sum(artists_n, na.rm = TRUE), na.rm = TRUE))
] == split, xynames, drop = FALSE], MARGIN = 2, FUN = median, na.rm plot$layers[[1]] mapping: colour = ~celltype.l1, x = ~Umap1, y = ~Umap2 geom_scattermore: na.rm = FALSE , interpolate = FALSE, pointsize = 1, pixels = c(512, 512) stat_identity: na.rm = FALSE position_identity = FALSE stat_identity: na.rm = FALSE position_identity [[2]] mapping: x = ~mpg, y = ~cyl, label = ~carb geom_text: parse = FALSE, check_overlap = FALSE, na.rm = FALSE stat_identity: na.rm = FALSE position_identity
my_describe <- function(x){ options(digits = 3) N = length(x); Nmiss = sum(is.na(x)); Min = min(x, na.rm = TRUE); Q1 = quantile(x, probs = 0.25, na.rm = TRUE); Median = median(x, na.rm = TRUE); Q3 = quantile(x, probs = 0.75, na.rm = TRUE); Max = max(x, na.rm = TRUE); Mean = mean(x, na.rm = TRUE) ; Sd = sd(x, na.rm = TRUE); Range = abs(diff(range(x))); skew <- sum((x-Mean)^3/Sd^3)/N kurt
R实现 #整理成描述性统计的函数 my_describe <- function(x){ options(digits = 3) N = length(x); Min = min(x, na.rm = TRUE); Q1 = quantile(x, probs = 0.25, na.rm = TRUE); Median = median(x, na.rm = TRUE); Q3 = quantile(x, probs = 0.75, na.rm = TRUE); Max = max(x, na.rm = TRUE); Mean = mean(x, na.rm = TRUE) ; Var = var(x, na.rm = TRUE); Sd = sd(x, na.rm = TRUE); Range = abs(diff(range(x))); #返回结果
arrow.fill = NULL, lineend = "butt", linejoin = "round", na.rm = FALSE) { data <- ggplot2:::remove_missing( data, na.rm = na.rm, c("x", "y", "xend", arrow.fill = NULL, lineend = "butt", linejoin = "round", na.rm = FALSE) { data <- ggplot2:::remove_missing( data, na.rm = na.rm, c("x", "y", "xend",
minimum minimum算法如下 E <- E - Eb for (slide in 1:ncol(E)) { i <- E[, slide] < 1e-18 if (any(i, na.rm i, slide], na.rm = TRUE) E[i, slide] <- m/2 } } 和half算法类似,也是为了避免负值问题,对于校正后的每一列,以1e-18作为阈值 4. movingmin movingmin算法如下 E <- E - ma3x3.matrix(Eb, FUN = min, na.rm = TRUE) 5. edwards edwards算法如下 matrix(1, nrow(E), 1) delta.vec <- function(d, f = 0.1) { quantile(d, mean(d < 1e-16, na.rm = TRUE) * (1 + f), na.rm = TRUE) } sub <- E - Eb delta <- one %
", "#3bceac") 数据清洗 df_mean <- df %>% group_by(type) %>% summarise(n_artists = sum(artists_n, na.rm ) df_base <- artists %>% group_by(type, state) %>% summarise( n_artists = sum(artists_n, na.rm = TRUE), lq = sum(log(location_quotient)*artists_n/sum(artists_n, na.rm = TRUE), na.rm = TRUE))
# 计算平均值的函数,可以指定是否移除 NA 值 calculate_mean <- function(data_vector, na.rm = TRUE) { if (na.rm) { data_vector <- na.omit(data_vector) } return(mean(data_vector, na.rm = FALSE)) # na.rm=FALSE 是为了避免mean函数内部再次处理 NA } values <- c(1, 2, 3, NA, 5) # 使用默认值 na.rm = TRUE mean1 <- calculate_mean(values) print(mean1) # 输出: 2.75 # 明确指定 na.rm = FALSE mean2 <- calculate_mean(values, na.rm = FALSE) print(mean2) # 输出: NA median_val <- median(expression_values, na.rm = FALSE) sd_val <- sd(expression_values, na.rm = FALSE