--- title: "BAPC" author: "Research Team" date: "2023-08-29" output: html_document --- #1.R包的安装与加载(所需R包较多,请确保可以加载成功,若加载不成功,请先安装) ```{R} library("data.table") library("tidyverse") library("caTools") library("fanplot") library("cmprsk") library("etm") library("BAPC") library("INLA") library("reshape") library("reshape2") library("tidyr") library("epitools") library("ggplot2") library("ggfan") ``` #2.求一个location, 一个population(sex&age),1990-2019的一个measure和metric的数据 ```{R} #举例:location:Global;Sex: Both; Age: Age-standardized; Measure: Deaths; Metric: Rate ##2.1 数据导入 ###数据导入 PAD_Global_5SDI_1990_to_2019 <- read.csv("./data/PAD_Global_5SDI_1990_to_2019.csv",header = T) ##2.2 数据清洗(select及filter函数筛选列与行) Global_Bothsex_Deaths_ASR <- PAD_Global_5SDI_1990_to_2019 %>% select("location_name","year","sex_name","age_name","measure_name","metric_name","val","upper","lower") %>% filter(location_name == 'Global', sex_name == "Both", age_name == "Age-standardized", measure_name == "Deaths", metric_name == 'Rate') ##2.3 可视化 A <- ggplot()+ geom_point(data=Global_Bothsex_Deaths_ASR,aes(x = year, y = val),color="#107B3A",shape = 24)+ geom_line(data=Global_Bothsex_Deaths_ASR,aes(x = year, y = val),color="#107B3A",linewidth=1)+ scale_y_continuous(breaks = c(0.6,0.8,1.0,1.2,1.4,1.6),limits = c(0.5,1.7))+ xlab("Year")+ ylab("ASMR")+ theme(panel.background = element_rect(fill = "transparent"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black",linewidth = 0.5), axis.text.x = element_text(colour = 'black',size = 6), axis.text.y = element_text(colour = 'black',size = 6), axis.title.x = element_text(colour = 'black',size = 8), axis.title.y = element_text(colour = 'black',size = 8)) #legend.position = 'bottom', #legend.key = element_blank(), #legend.title = element_blank()) #theme里面设置:面(panel) + 轴(axis) + 图例 ##2.4 导出 ggsave(A, file = "./output/PAD_Global_bothsex_1990_to_2019_death_ASR.pdf", width = 10, height = 8, units = "cm") ``` #3.求1个location, 3个population(3sex&age),1990-2019的一个measure和metric的数据 ```{R} #举例:location:Global;Sex: Both,Male,Female; Age: Age-standardized; Measure: Deaths; Metric: Rate ##3.1 数据导入 PAD_Global_5SDI_1990_to_2019 <- read.csv("./data/PAD_Global_5SDI_1990_to_2019.csv",header = T) ##3.2 数据清洗 Global_threesex_Deaths_ASR <- PAD_Global_5SDI_1990_to_2019 %>% select("location_name","year","sex_name","age_name","measure_name","metric_name","val","upper","lower") %>% filter(location_name == "Global", sex_name %in% c("Both","Male","Female"),#这是与上一个部分的不同 age_name == "Age-standardized", measure_name == "Deaths", metric_name == 'Rate') Global_threesex_Deaths_ASR$sex_name <- fct_relevel(Global_threesex_Deaths_ASR$sex_name,c("Both","Male","Female")) ##3.3 可视化 B <- ggplot()+ geom_point(data=Global_threesex_Deaths_ASR,aes(x = year, y = val,group = sex_name,color=sex_name,shape = sex_name))+ geom_line(data=Global_threesex_Deaths_ASR,aes(x = year, y = val,group = sex_name,color=sex_name),linewidth=1)+ scale_y_continuous(breaks = c(0.6,0.8,1.0,1.2,1.4,1.6),limits = c(0.5,1.7))+ scale_color_manual(values = c("Both" = "#107B3A", "Male" = "#EC5E17", "Female" = "#043391"))+ #设置颜色 scale_shape_manual(values = c("Both" = 2, "Male" = 1, "Female" = 15)) + # 设置形状 xlab("Year")+ ylab("ASMR")+ theme(panel.background = element_rect(fill = "transparent"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black",linewidth = 0.5), axis.text.x = element_text(colour = 'black',size = 6), axis.text.y = element_text(colour = 'black',size = 6), axis.title.x = element_text(colour = 'black',size = 8), axis.title.y = element_text(colour = 'black',size = 8), legend.position = 'bottom', legend.key = element_blank(), legend.title = element_blank()) ##2.4 导出 ggsave(B, file = "./output/PAD_Global_threesex_1990_to_2019_death_ASR.pdf", width = 10, height = 8, units = "cm") ``` #4.求一个location, 一个population(sex&age),1990-2019 + 2020-2030的一个measure和metric的数据 #4.1数据导入 ```{R} ##4.1.1 疾病文件(1990-2019) PAD_Global_5SDI_1990_to_2019 <- read.csv("./data/PAD_Global_5SDI_1990_to_2019.csv",header = T) ##4.1.2 当前人口文件(195countries_5SDI_Global) GBD_population_current <- read.csv('./data/GBD_population_1990to2019.csv',header = T) GBD_population_current <- GBD_population_current %>% select("location_name", "sex_name", "year_id","age_group_name", "val") GBD_population_current <- GBD_population_current %>% distinct(across(c(location_name, sex_name, year_id, age_group_name)), .keep_all = TRUE) ##4.1.3 预测人口文件(195countries_5SDI_Global GBD_population_prediction <- read.csv('./data/GBD_population_2019to2100.csv',header = T) GBD_population_prediction <- GBD_population_prediction %>% select("location_name", "sex", "year_id","age_group_name", "val") ##4.1.4 标准人口,各年龄组比例 age_stand <- read.csv('./data/GBD2019 world population age standard.csv', header = T) ##4.1.5 Order文件 order <- readxl::read_xlsx('./data/2.orders.xlsx', sheet = 3) ``` #4.2 针对疾病数据的清洗,需要个性化 ```{R} ##4.2.1 age_name列的字符串修改 PAD_Global_5SDI_1990_to_2019$age_name <- str_split(PAD_Global_5SDI_1990_to_2019$age_name, ' ', simplify = T)[,1] #字符串拆分 PAD_Global_5SDI_1990_to_2019$age_name <- str_replace(PAD_Global_5SDI_1990_to_2019$age_name, '-',' to ') #字符串替换 PAD_Global_5SDI_1990_to_2019$age_name <- str_replace(PAD_Global_5SDI_1990_to_2019$age_name, '\\+',' plus') #字符串替换 PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == 'All')] <- 'All age_names' PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == '<1')] <- '<1 year' PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == '<5')] <- 'Under 5' ##4.2.2 列与行的筛选(select & filter) data_all_select <- PAD_Global_5SDI_1990_to_2019 %>% select("location_name","year","sex_name","age_name","measure_name","metric_name","val","upper","lower") %>% filter(location_name == 'Global',#根据需要来修改 sex_name == "Both",#根据需要来修改 age_name %in% c("Under 5","5 to 9" ,"10 to 14", "15 to 19","20 to 24","25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"), measure_name == "Deaths", #根据需要来修改 metric_name == 'Number') ##观察数据框“data_all_select“可知:TARGET_DISEASE death在40岁以下的人群是没有数据的,但是40岁以下的数据对于预测来说是必要的,因此需要用0来补全 ##4.2.3 整理年份year(行) * 年龄分组age_name(列)的矩阵,即30行*20列 data_all_select_n <- dcast(data = data_all_select, year~age_name, value.var = "val") %>% column_to_rownames(var = 'year') %>% mutate('0 to 4' = 0, '5 to 9' = 0, '10 to 14' = 0, '15 to 19' = 0, "20 to 24" = 0, "25 to 29" = 0, "30 to 34" = 0, "35 to 39" =0) %>% dplyr::select('0 to 4', '5 to 9', '10 to 14', '15 to 19','20 to 24','25 to 29', "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus") %>% apply(., 2, round) %>% #对数据框 data_all_select_n 中的所有列应用 round 函数,以将列中的数值四舍五入 as.data.frame() #总结:上述内容中,需要个性化修改的部分:4.2中filter的location_name,sex_name,measure_name;4.3中的mutate对应的年龄段 ``` #4.3 针对预测人口数(2020-2030)的个性化数据清洗,需要个性化(主要是预测的终点年份) ```{R} GBD_population_prediction <- GBD_population_prediction %>% select("location_name","sex","year_id","age_group_name","val") %>% filter(age_group_name %in% c("<1 year","1 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"), year_id %in% (2019+1):2030) %>% dplyr::rename(sex_name = sex) #为了后面与“Current_GBD_Population"数据框合并 #总结:上述内容中,需要个性化修改的部分:2019+1和2030,二者分别是预测的起点和终点 ``` #4.4 当前人口和预测人口合并(1990-2030 = 1990-2019 & 2020-2030),需要个性化 ```{R} ## 4.6.1 rbind函数将Current_GBD_Population与GBD_population_prediction合并(有21个年龄组,没有“0 to 4",只有"<1 year"和"1 to 4") GBD <- rbind(GBD_population_current,GBD_population_prediction) ##4,6.2 将"<1 year"和"1 to 4"合并为“0 to 4" GBD_age4 <- GBD %>% subset(age_group_name %in% c("<1 year","1 to 4")) %>% group_by(location_name,sex_name,year_id) %>% summarize(val=sum(val)) %>% mutate(age_group_name = '0 to 4') %>% dplyr::select(1:3,5,4) ##4.6.3 将4.6.1中的5岁以上的数据和4.6.2中“0 to 4"的数据合并 GBD <- subset(GBD, age_group_name %in% c( "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus")) %>% rbind(., GBD_age4) %>% mutate(age_group_name=fct_relevel(age_group_name,c("0 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"))) %>% #为后续分析做铺垫 arrange(age_group_name) ##4.6.4 最后审查人口学数据文件(对每一列审核,可以用unique(GBD$列名),如unique(GBD$sex_name)),这个GBD数据框可以保存,后期直接使用 GBD$sex_name[which(GBD$sex_name == 'both')] <- 'Both' ##4.6.5 调整人口学数据,整合成BAPC可以识别的数据结构:年份year(行) * 年龄分组age_group(列) GBD_select <- GBD %>% filter(location_name == 'Global', #根据需要调整 sex_name == "Both")#根据需要调整 GBD_select_n <- dcast(data = unique(GBD_select), year_id ~ age_group_name,value.var = c("val")) %>% column_to_rownames(var = 'year_id') ##4.6.6 获取标准人口结构数据,并调整数据结构。<1和0-4岁合并成一组 age_stand <- age_stand %>% filter(age %in% c("<1 year","1 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59","60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus")) wstand <- c(age_stand$std_population[1:2] %>% as.numeric() %>% sum(), age_stand$std_population[3:21] %>% as.numeric())/sum(age_stand$std_population[1:21]) #sum(age_stand$std_population[1:21])=100 ##4.6.7 整合历史数据和预测数据 data_all_pro <- matrix(data = NA, nrow = 2030 - 2019, ncol = ncol(GBD_select_n)) %>% as.data.frame() %>% `rownames<-` (.,seq(2019+1,2030,1)) %>% `colnames<-`(.,colnames(data_all_select_n)) #注意不是引号,是反引号,调用函数。最后的两排也可以用rownames(data_all_pro) <- seq(2019+1,2030,1)和colnames(data_all_pro) <- names(data_all_select_n) data_all_select_1990_to_2030 <- rbind(data_all_select_n, data_all_pro) a <- data_all_select_1990_to_2030 b <- GBD_select_n esoph <- APCList(a, b, gf = 5) bapc_result <- BAPC(esoph, predict = list(npredict = 10, retro = T), secondDiff = FALSE, stdweight = wstand, verbose = T) BAPC_list<- bapc_result #总结:上述内容中,需要个性化修改的部分:4.6.5 filter的location_name和sex_name;4.6.7 中的两个数字2030和2019 ``` #4.5-1可视化-rate ```{R} probs = c(0.002,seq(0.004, 0.996, by = 0.002), 0.998) newcolnames <- paste(probs, "Q", sep = "") #### ASMR #### 画图数据 data_plot_all <- data.frame(matrix(data = NA, nrow = 0, ncol = 6)) APCList <- qapc(BAPC_list, percentiles = probs) J <- nperiod(APCList) plab <- periodlabels(APCList) my.wm <- matrix(rep(stdweight(APCList), each = J), byrow = F, nrow = J) st <- 1990 - 1989 agg.n <- rowSums(pyrs(APCList)) scale <- 10^5 #根据单位来修改,目前的单位是“/100000” data_forplot <- t(agestd.proj(APCList)[st:J, newcolnames]/agg.n[st:J] * scale) %>% as.data.frame()%>% rownames_to_column('quantile') %>% pivot_longer(cols = colnames(.)[-1], names_to = 'year') %>% dplyr::mutate(year = as.integer(year)) %>% dplyr::mutate(sub = 'Global') #根据具体情况修改名称 median <- data_forplot[data_forplot$quantile == '0.5Q',c('year', 'value')] %>% dplyr::rename('median_data' = 'value') y <- epi(APCList) n <- pyrs(APCList) data_forplot <- left_join(data_forplot, median, by = 'year') data_forplot$median_data2 <- ifelse(data_forplot$year > 2019, NA, data_forplot$median_data) data_plot_all <- rbind(data_plot_all,data_forplot) color_forpoint = c('Global' = '#66c2a5', 'High SDI' = '#fc8d62', 'High-middle SDI' = '#8da0cb', 'Low SDI' = '#e78ac3', 'Low-middle SDI' = '#a6d854', 'Middle SDI' = '#ffd92f') shape_forpoint = c('Global' = 15, 'High SDI' = 16, 'High-middle SDI' = 17, 'Low SDI' = 18, 'Low-middle SDI' = 25, 'Middle SDI' = 8) #color_forpoint和shape_forpiont都是根据需要来修改 color_forfan = '#eff3ff' plot <- ggplot(data_plot_all, aes(x=year,y=value, group = sub))+ geom_fan()+scale_fill_gradientn(colours = rep(color_forfan,2), guide = FALSE) + geom_point( aes(x = year, y = median_data2, color = sub,shape = sub))+ geom_line(aes(x = year, y = median_data, color = sub))+ scale_color_manual(values = color_forpoint)+ scale_shape_manual(values = shape_forpoint)+ scale_y_continuous(breaks = c(0.6,0.8,1.0,1.2,1.4,1.6,1.8),limits = c(0.5,1.8))+ geom_vline(xintercept = 2019,linetype = 2)+ ylab('ASMR (per 100,000)') + xlab('Year')+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_rect(fill = "transparent"), plot.background = element_rect(fill = "transparent"), axis.line = element_line(colour = "black"), text = element_text(size = 8), axis.text = element_text(colour = 'black'), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1), legend.key = element_blank(), legend.position = 'bottom', legend.title = element_blank()) #ggsave(plot, file="./output/Global_bothsex_1990-2030_ASMR.pdf",units = "cm", width = 10,height = 8) #上述内容中,需要个性化修改的地方,都有备注 ``` #4.5-2可视化-number ```{R} data_plot_all <- data.frame(matrix(data = NA, nrow = 0, ncol = 6)) probs = c(0.002,seq(0.004, 0.996, by = 0.002), 0.998) newcolnames <- paste(probs, "Q", sep = "") APCList <- qapc(BAPC_list, percentiles = probs) J <- nperiod(APCList) st <- 1990 - 1989 scale <- 1 data_forplot <- t(agestd.proj(APCList)[st:J, newcolnames]/ scale) %>% as.data.frame() %>% rownames_to_column('quantile') %>% pivot_longer(cols = colnames(.)[-1], names_to = 'year') %>% dplyr::mutate(year = as.integer(year)) %>% dplyr::mutate(sub = 'Global') median <- data_forplot[data_forplot$quantile == '0.5Q',c('year', 'value')] %>% dplyr::rename('median_data' = 'value') y <- epi(APCList) n <- pyrs(APCList) data_forplot <- left_join(data_forplot, median, by = 'year') data_forplot$median_data2 <- ifelse(data_forplot$year > 2019, NA, data_forplot$median_data) data_plot_all <- rbind(data_plot_all,data_forplot) color_forpoint = c('Global' = '#66c2a5', 'High SDI' = '#fc8d62', 'High-middle SDI' = '#8da0cb', 'Low SDI' = '#e78ac3', 'Low-middle SDI' = '#a6d854', 'Middle SDI' = '#ffd92f') shape_forpoint = c('Global' = 15, 'High SDI' = 16, 'High-middle SDI' = 17, 'Low SDI' = 18, 'Low-middle SDI' = 25, 'Middle SDI' = 8) color_forfan = '#eff3ff' plot <- ggplot(data_plot_all, aes(x=year,y=value, group = sub))+ geom_fan()+scale_fill_gradientn(colours = rep(color_forfan,2), guide = FALSE) + geom_point( aes(x = year, y = median_data2, color = sub,shape = sub))+ geom_line(aes(x = year, y = median_data, color = sub))+ scale_color_manual(values = color_forpoint)+ scale_shape_manual(values = shape_forpoint)+ #scale_y_continuous(labels = function(x) format(x, scientific = FALSE))+ scale_y_continuous(breaks = c(0,20000,40000,60000,80000,100000,120000),limits = c(0,125000))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_rect(fill = "transparent"), plot.background = element_rect(fill = "transparent"), axis.line = element_line(colour = "black"), text = element_text(size = 8), axis.text = element_text(colour = 'black'), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1), legend.key = element_blank(), legend.position = 'bottom', legend.title = element_blank()) + geom_vline(xintercept = 2019,linetype = 2)+ ylab('Number') + xlab('Year') ggsave(plot, file="./output/Global_bothsex_1990-2030_Deaths_number.pdf",units = "cm", width = 10,height = 8) ``` #5.求一个location, 多个population(sex&age),1990-2019 + 2020-2030的一个measure和metric的数据(举例:Global,3种sex,Deaths,rate) #5A.针对Male的数据清洗 #5A.1数据导入(同4.1) ```{R} ##4.1.1 疾病文件(1990-2019) PAD_Global_5SDI_1990_to_2019 <- read.csv("./data/PAD_Global_5SDI_1990_to_2019.csv",header = T) ##4.1.2 当前人口文件(195countries_5SDI_Global) GBD_population_current <- read.csv('./data/GBD_population_1990to2019.csv',header = T) GBD_population_current <- GBD_population_current %>% select("location_name", "sex_name", "year_id","age_group_name", "val") GBD_population_current <- GBD_population_current %>% distinct(across(c(location_name, sex_name, year_id, age_group_name)), .keep_all = TRUE) ##4.1.3 预测人口文件(195countries_5SDI_Global GBD_population_prediction <- read.csv('./data/GBD_population_2019to2100.csv',header = T) GBD_population_prediction <- GBD_population_prediction %>% select("location_name", "sex", "year_id","age_group_name", "val") ##4.1.4 标准人口,各年龄组比例 age_stand <- read.csv('./data/GBD2019 world population age standard.csv', header = T) ##4.1.5 Order文件 order <- readxl::read_xlsx('./data/2.orders.xlsx', sheet = 3) ``` #5A.2 针对疾病数据的清洗,需要个性化 ```{R} ##4.2.1 age_name列的字符串修改 PAD_Global_5SDI_1990_to_2019$age_name <- str_split(PAD_Global_5SDI_1990_to_2019$age_name, ' ', simplify = T)[,1] #字符串拆分 PAD_Global_5SDI_1990_to_2019$age_name <- str_replace(PAD_Global_5SDI_1990_to_2019$age_name, '-',' to ') #字符串替换 PAD_Global_5SDI_1990_to_2019$age_name <- str_replace(PAD_Global_5SDI_1990_to_2019$age_name, '\\+',' plus') #字符串替换 PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == 'All')] <- 'All age_names' PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == '<1')] <- '<1 year' PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == '<5')] <- 'Under 5' ##4.2.2 列与行的筛选(select & filter) data_all_select <- PAD_Global_5SDI_1990_to_2019 %>% select("location_name","year","sex_name","age_name","measure_name","metric_name","val","upper","lower") %>% filter(location_name == 'Global',#根据需要来修改 sex_name == "Male",#根据需要来修改 age_name %in% c("Under 5","5 to 9" ,"10 to 14", "15 to 19","20 to 24","25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"), measure_name == "Deaths", #根据需要来修改 metric_name == 'Number') ##观察数据框“data_all_select“可知:TARGET_DISEASE death在40岁以下的人群是没有数据的,但是40岁以下的数据对于预测来说是必要的,因此需要用0来补全 ##4.2.3 整理年份year(行) * 年龄分组age_name(列)的矩阵,即30行*20列 data_all_select_n <- dcast(data = data_all_select, year~age_name, value.var = "val") %>% column_to_rownames(var = 'year') %>% mutate('0 to 4' = 0, '5 to 9' = 0, '10 to 14' = 0, '15 to 19' = 0, "20 to 24" = 0, "25 to 29" = 0, "30 to 34" = 0, "35 to 39" =0) %>% dplyr::select('0 to 4', '5 to 9', '10 to 14', '15 to 19','20 to 24','25 to 29', "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus") %>% apply(., 2, round) %>% #对数据框 data_all_select_n 中的所有列应用 round 函数,以将列中的数值四舍五入 as.data.frame() #总结:上述内容中,需要个性化修改的部分:4.2中filter的location_name,sex_name,measure_name;4.3中的mutate对应的年龄段 ``` #5A.3 针对预测人口数(2020-2030)的个性化数据清洗,需要个性化(主要是预测的终点年份) ```{R} GBD_population_prediction <- GBD_population_prediction %>% select("location_name","sex","year_id","age_group_name","val") %>% filter(age_group_name %in% c("<1 year","1 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"), year_id %in% (2019+1):2030) %>% dplyr::rename(sex_name = sex) #为了后面与“Current_GBD_Population"数据框合并 #总结:上述内容中,需要个性化修改的部分:2019+1和2030,二者分别是预测的起点和终点 ``` #5A.4 当前人口和预测人口合并(1990-2030 = 1990-2019 & 2020-2030),需要个性化 ```{R} ## 4.6.1 rbind函数将Current_GBD_Population与GBD_population_prediction合并(有21个年龄组,没有“0 to 4",只有"<1 year"和"1 to 4") GBD <- rbind(GBD_population_current,GBD_population_prediction) ##4,6.2 将"<1 year"和"1 to 4"合并为“0 to 4" GBD_age4 <- GBD %>% subset(age_group_name %in% c("<1 year","1 to 4")) %>% group_by(location_name,sex_name,year_id) %>% summarize(val=sum(val)) %>% mutate(age_group_name = '0 to 4') %>% dplyr::select(1:3,5,4) ##4.6.3 将4.6.1中的5岁以上的数据和4.6.2中“0 to 4"的数据合并 GBD <- subset(GBD, age_group_name %in% c( "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus")) %>% rbind(., GBD_age4) %>% mutate(age_group_name=fct_relevel(age_group_name,c("0 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"))) %>% #为后续分析做铺垫 arrange(age_group_name) ##4.6.4 最后审查人口学数据文件(对每一列审核,可以用unique(GBD$列名),如unique(GBD$sex_name)),这个GBD数据框可以保存,后期直接使用 GBD$sex_name[which(GBD$sex_name == 'both')] <- 'Both' ##4.6.5 调整人口学数据,整合成BAPC可以识别的数据结构:年份year(行) * 年龄分组age_group(列) GBD_select <- GBD %>% filter(location_name == 'Global', #根据需要调整 sex_name == "Male")#根据需要调整 GBD_select_n <- dcast(data = unique(GBD_select), year_id ~ age_group_name,value.var = c("val")) %>% column_to_rownames(var = 'year_id') ##4.6.6 获取标准人口结构数据,并调整数据结构。<1和0-4岁合并成一组 age_stand <- age_stand %>% filter(age %in% c("<1 year","1 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59","60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus")) wstand <- c(age_stand$std_population[1:2] %>% as.numeric() %>% sum(), age_stand$std_population[3:21] %>% as.numeric())/sum(age_stand$std_population[1:21]) #sum(age_stand$std_population[1:21])=100 ##4.6.7 整合历史数据和预测数据 data_all_pro <- matrix(data = NA, nrow = 2030 - 2019, ncol = ncol(GBD_select_n)) %>% as.data.frame() %>% `rownames<-` (.,seq(2019+1,2030,1)) %>% `colnames<-`(.,colnames(data_all_select_n)) #注意不是引号,是反引号,调用函数。最后的两排也可以用rownames(data_all_pro) <- seq(2019+1,2030,1)和colnames(data_all_pro) <- names(data_all_select_n) data_all_select_1990_to_2030 <- rbind(data_all_select_n, data_all_pro) a <- data_all_select_1990_to_2030 b <- GBD_select_n esoph <- APCList(a, b, gf = 5) bapc_result <- BAPC(esoph, predict = list(npredict = 10, retro = T), secondDiff = FALSE, stdweight = wstand, verbose = T) BAPC_list<- bapc_result #总结:上述内容中,需要个性化修改的部分:4.6.5 filter的location_name和sex_name;4.6.7 中的两个数字2030和2019 ``` #5C.5 作图数据准备,需个性化 ```{R} probs = c(0.002,seq(0.004, 0.996, by = 0.002), 0.998) newcolnames <- paste(probs, "Q", sep = "") data_plot_all <- data.frame(matrix(data = NA, nrow = 0, ncol = 6)) APCList <- qapc(BAPC_list, percentiles = probs) J <- nperiod(APCList) plab <- periodlabels(APCList) my.wm <- matrix(rep(stdweight(APCList), each = J), byrow = F, nrow = J) st <- 1990 - 1989 agg.n <- rowSums(pyrs(APCList)) scale <- 10^5 #根据单位来修改,目前的单位是“/100000” data_forplot <- t(agestd.proj(APCList)[st:J, newcolnames]/agg.n[st:J] * scale) %>% as.data.frame()%>% rownames_to_column('quantile') %>% pivot_longer(cols = colnames(.)[-1], names_to = 'year') %>% dplyr::mutate(year = as.integer(year)) %>% dplyr::mutate(sub = 'Male') #根据具体情况修改名称 median <- data_forplot[data_forplot$quantile == '0.5Q',c('year', 'value')] %>% dplyr::rename('median_data' = 'value') y <- epi(APCList) n <- pyrs(APCList) data_forplot <- left_join(data_forplot, median, by = 'year') data_forplot$median_data2 <- ifelse(data_forplot$year > 2019, NA, data_forplot$median_data) data_plot_all_Male <- rbind(data_plot_all,data_forplot) #得到了一个关键的作图数据框:data_plot_all_Male ``` #5B.针对Female的数据清洗 #5B.1数据导入(同4.1) ```{R} ##4.1.1 疾病文件(1990-2019) PAD_Global_5SDI_1990_to_2019 <- read.csv("./data/PAD_Global_5SDI_1990_to_2019.csv",header = T) ##4.1.2 当前人口文件(195countries_5SDI_Global) GBD_population_current <- read.csv('./data/GBD_population_1990to2019.csv',header = T) GBD_population_current <- GBD_population_current %>% select("location_name", "sex_name", "year_id","age_group_name", "val") GBD_population_current <- GBD_population_current %>% distinct(across(c(location_name, sex_name, year_id, age_group_name)), .keep_all = TRUE) ##4.1.3 预测人口文件(195countries_5SDI_Global GBD_population_prediction <- read.csv('./data/GBD_population_2019to2100.csv',header = T) GBD_population_prediction <- GBD_population_prediction %>% select("location_name", "sex", "year_id","age_group_name", "val") ##4.1.4 标准人口,各年龄组比例 age_stand <- read.csv('./data/GBD2019 world population age standard.csv', header = T) ##4.1.5 Order文件 order <- readxl::read_xlsx('./data/2.orders.xlsx', sheet = 3) ``` #5B.2 针对疾病数据的清洗,需要个性化 ```{R} ##4.2.1 age_name列的字符串修改 PAD_Global_5SDI_1990_to_2019$age_name <- str_split(PAD_Global_5SDI_1990_to_2019$age_name, ' ', simplify = T)[,1] #字符串拆分 PAD_Global_5SDI_1990_to_2019$age_name <- str_replace(PAD_Global_5SDI_1990_to_2019$age_name, '-',' to ') #字符串替换 PAD_Global_5SDI_1990_to_2019$age_name <- str_replace(PAD_Global_5SDI_1990_to_2019$age_name, '\\+',' plus') #字符串替换 PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == 'All')] <- 'All age_names' PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == '<1')] <- '<1 year' PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == '<5')] <- 'Under 5' ##4.2.2 列与行的筛选(select & filter) data_all_select <- PAD_Global_5SDI_1990_to_2019 %>% select("location_name","year","sex_name","age_name","measure_name","metric_name","val","upper","lower") %>% filter(location_name == 'Global',#根据需要来修改 sex_name == "Female",#根据需要来修改 age_name %in% c("Under 5","5 to 9" ,"10 to 14", "15 to 19","20 to 24","25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"), measure_name == "Deaths", #根据需要来修改 metric_name == 'Number') ##观察数据框“data_all_select“可知:TARGET_DISEASE death在40岁以下的人群是没有数据的,但是40岁以下的数据对于预测来说是必要的,因此需要用0来补全 ##4.2.3 整理年份year(行) * 年龄分组age_name(列)的矩阵,即30行*20列 data_all_select_n <- dcast(data = data_all_select, year~age_name, value.var = "val") %>% column_to_rownames(var = 'year') %>% mutate('0 to 4' = 0, '5 to 9' = 0, '10 to 14' = 0, '15 to 19' = 0, "20 to 24" = 0, "25 to 29" = 0, "30 to 34" = 0, "35 to 39" =0) %>% dplyr::select('0 to 4', '5 to 9', '10 to 14', '15 to 19','20 to 24','25 to 29', "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus") %>% apply(., 2, round) %>% #对数据框 data_all_select_n 中的所有列应用 round 函数,以将列中的数值四舍五入 as.data.frame() #总结:上述内容中,需要个性化修改的部分:4.2中filter的location_name,sex_name,measure_name;4.3中的mutate对应的年龄段 ``` #5B.3 针对预测人口数(2020-2030)的个性化数据清洗,需要个性化(主要是预测的终点年份) ```{R} GBD_population_prediction <- GBD_population_prediction %>% select("location_name","sex","year_id","age_group_name","val") %>% filter(age_group_name %in% c("<1 year","1 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"), year_id %in% (2019+1):2030) %>% dplyr::rename(sex_name = sex) #为了后面与“Current_GBD_Population"数据框合并 #总结:上述内容中,需要个性化修改的部分:2019+1和2030,二者分别是预测的起点和终点 ``` #5B.4 当前人口和预测人口合并(1990-2030 = 1990-2019 & 2020-2030),需要个性化 ```{R} ## 4.6.1 rbind函数将Current_GBD_Population与GBD_population_prediction合并(有21个年龄组,没有“0 to 4",只有"<1 year"和"1 to 4") GBD <- rbind(GBD_population_current,GBD_population_prediction) ##4,6.2 将"<1 year"和"1 to 4"合并为“0 to 4" GBD_age4 <- GBD %>% subset(age_group_name %in% c("<1 year","1 to 4")) %>% group_by(location_name,sex_name,year_id) %>% summarize(val=sum(val)) %>% mutate(age_group_name = '0 to 4') %>% dplyr::select(1:3,5,4) ##4.6.3 将4.6.1中的5岁以上的数据和4.6.2中“0 to 4"的数据合并 GBD <- subset(GBD, age_group_name %in% c( "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus")) %>% rbind(., GBD_age4) %>% mutate(age_group_name=fct_relevel(age_group_name,c("0 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"))) %>% #为后续分析做铺垫 arrange(age_group_name) ##4.6.4 最后审查人口学数据文件(对每一列审核,可以用unique(GBD$列名),如unique(GBD$sex_name)),这个GBD数据框可以保存,后期直接使用 GBD$sex_name[which(GBD$sex_name == 'both')] <- 'Both' ##4.6.5 调整人口学数据,整合成BAPC可以识别的数据结构:年份year(行) * 年龄分组age_group(列) GBD_select <- GBD %>% filter(location_name == 'Global', #根据需要调整 sex_name == "Female")#根据需要调整 GBD_select_n <- dcast(data = unique(GBD_select), year_id ~ age_group_name,value.var = c("val")) %>% column_to_rownames(var = 'year_id') ##4.6.6 获取标准人口结构数据,并调整数据结构。<1和0-4岁合并成一组 age_stand <- age_stand %>% filter(age %in% c("<1 year","1 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59","60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus")) wstand <- c(age_stand$std_population[1:2] %>% as.numeric() %>% sum(), age_stand$std_population[3:21] %>% as.numeric())/sum(age_stand$std_population[1:21]) #sum(age_stand$std_population[1:21])=100 ##4.6.7 整合历史数据和预测数据 data_all_pro <- matrix(data = NA, nrow = 2030 - 2019, ncol = ncol(GBD_select_n)) %>% as.data.frame() %>% `rownames<-` (.,seq(2019+1,2030,1)) %>% `colnames<-`(.,colnames(data_all_select_n)) #注意不是引号,是反引号,调用函数。最后的两排也可以用rownames(data_all_pro) <- seq(2019+1,2030,1)和colnames(data_all_pro) <- names(data_all_select_n) data_all_select_1990_to_2030 <- rbind(data_all_select_n, data_all_pro) a <- data_all_select_1990_to_2030 b <- GBD_select_n esoph <- APCList(a, b, gf = 5) bapc_result <- BAPC(esoph, predict = list(npredict = 10, retro = T), secondDiff = FALSE, stdweight = wstand, verbose = T) BAPC_list<- bapc_result #总结:上述内容中,需要个性化修改的部分:4.6.5 filter的location_name和sex_name;4.6.7 中的两个数字2030和2019 ``` #5B.5 作图数据准备,需个性化 ```{R} probs = c(0.002,seq(0.004, 0.996, by = 0.002), 0.998) newcolnames <- paste(probs, "Q", sep = "") data_plot_all <- data.frame(matrix(data = NA, nrow = 0, ncol = 6)) APCList <- qapc(BAPC_list, percentiles = probs) J <- nperiod(APCList) plab <- periodlabels(APCList) my.wm <- matrix(rep(stdweight(APCList), each = J), byrow = F, nrow = J) st <- 1990 - 1989 agg.n <- rowSums(pyrs(APCList)) scale <- 10^5 #根据单位来修改,目前的单位是“/100000” data_forplot <- t(agestd.proj(APCList)[st:J, newcolnames]/agg.n[st:J] * scale) %>% as.data.frame()%>% rownames_to_column('quantile') %>% pivot_longer(cols = colnames(.)[-1], names_to = 'year') %>% dplyr::mutate(year = as.integer(year)) %>% dplyr::mutate(sub = 'Female') #根据具体情况修改名称 median <- data_forplot[data_forplot$quantile == '0.5Q',c('year', 'value')] %>% dplyr::rename('median_data' = 'value') y <- epi(APCList) n <- pyrs(APCList) data_forplot <- left_join(data_forplot, median, by = 'year') data_forplot$median_data2 <- ifelse(data_forplot$year > 2019, NA, data_forplot$median_data) data_plot_all_Female <- rbind(data_plot_all,data_forplot) #得到了一个关键的作图数据框:data_plot_all_Female ``` #5C.针对Both的数据清洗 #5C.1数据导入(同4.1) ```{R} ##4.1.1 疾病文件(1990-2019) PAD_Global_5SDI_1990_to_2019 <- read.csv("./data/PAD_Global_5SDI_1990_to_2019.csv",header = T) ##4.1.2 当前人口文件(195countries_5SDI_Global) GBD_population_current <- read.csv('./data/GBD_population_1990to2019.csv',header = T) GBD_population_current <- GBD_population_current %>% select("location_name", "sex_name", "year_id","age_group_name", "val") GBD_population_current <- GBD_population_current %>% distinct(across(c(location_name, sex_name, year_id, age_group_name)), .keep_all = TRUE) ##4.1.3 预测人口文件(195countries_5SDI_Global GBD_population_prediction <- read.csv('./data/GBD_population_2019to2100.csv',header = T) GBD_population_prediction <- GBD_population_prediction %>% select("location_name", "sex", "year_id","age_group_name", "val") ##4.1.4 标准人口,各年龄组比例 age_stand <- read.csv('./data/GBD2019 world population age standard.csv', header = T) ##4.1.5 Order文件 order <- readxl::read_xlsx('./data/2.orders.xlsx', sheet = 3) ``` #5C.2 针对疾病数据的清洗,需要个性化 ```{R} ##4.2.1 age_name列的字符串修改 PAD_Global_5SDI_1990_to_2019$age_name <- str_split(PAD_Global_5SDI_1990_to_2019$age_name, ' ', simplify = T)[,1] #字符串拆分 PAD_Global_5SDI_1990_to_2019$age_name <- str_replace(PAD_Global_5SDI_1990_to_2019$age_name, '-',' to ') #字符串替换 PAD_Global_5SDI_1990_to_2019$age_name <- str_replace(PAD_Global_5SDI_1990_to_2019$age_name, '\\+',' plus') #字符串替换 PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == 'All')] <- 'All age_names' PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == '<1')] <- '<1 year' PAD_Global_5SDI_1990_to_2019$age_name[which(PAD_Global_5SDI_1990_to_2019$age_name == '<5')] <- 'Under 5' ##4.2.2 列与行的筛选(select & filter) data_all_select <- PAD_Global_5SDI_1990_to_2019 %>% select("location_name","year","sex_name","age_name","measure_name","metric_name","val","upper","lower") %>% filter(location_name == 'Global',#根据需要来修改 sex_name == "Both",#根据需要来修改 age_name %in% c("Under 5","5 to 9" ,"10 to 14", "15 to 19","20 to 24","25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"), measure_name == "Deaths", #根据需要来修改 metric_name == 'Number') ##观察数据框“data_all_select“可知:TARGET_DISEASE death在40岁以下的人群是没有数据的,但是40岁以下的数据对于预测来说是必要的,因此需要用0来补全 ##4.2.3 整理年份year(行) * 年龄分组age_name(列)的矩阵,即30行*20列 data_all_select_n <- dcast(data = data_all_select, year~age_name, value.var = "val") %>% column_to_rownames(var = 'year') %>% mutate('0 to 4' = 0, '5 to 9' = 0, '10 to 14' = 0, '15 to 19' = 0, "20 to 24" = 0, "25 to 29" = 0, "30 to 34" = 0, "35 to 39" =0) %>% dplyr::select('0 to 4', '5 to 9', '10 to 14', '15 to 19','20 to 24','25 to 29', "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus") %>% apply(., 2, round) %>% #对数据框 data_all_select_n 中的所有列应用 round 函数,以将列中的数值四舍五入 as.data.frame() #总结:上述内容中,需要个性化修改的部分:4.2中filter的location_name,sex_name,measure_name;4.3中的mutate对应的年龄段 ``` #5C.3 针对预测人口数(2020-2030)的个性化数据清洗,需要个性化(主要是预测的终点年份) ```{R} GBD_population_prediction <- GBD_population_prediction %>% select("location_name","sex","year_id","age_group_name","val") %>% filter(age_group_name %in% c("<1 year","1 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"), year_id %in% (2019+1):2030) %>% dplyr::rename(sex_name = sex) #为了后面与“Current_GBD_Population"数据框合并 #总结:上述内容中,需要个性化修改的部分:2019+1和2030,二者分别是预测的起点和终点 ``` #5C.4 当前人口和预测人口合并(1990-2030 = 1990-2019 & 2020-2030),需要个性化 ```{R} ## 4.6.1 rbind函数将Current_GBD_Population与GBD_population_prediction合并(有21个年龄组,没有“0 to 4",只有"<1 year"和"1 to 4") GBD <- rbind(GBD_population_current,GBD_population_prediction) ##4,6.2 将"<1 year"和"1 to 4"合并为“0 to 4" GBD_age4 <- GBD %>% subset(age_group_name %in% c("<1 year","1 to 4")) %>% group_by(location_name,sex_name,year_id) %>% summarize(val=sum(val)) %>% mutate(age_group_name = '0 to 4') %>% dplyr::select(1:3,5,4) ##4.6.3 将4.6.1中的5岁以上的数据和4.6.2中“0 to 4"的数据合并 GBD <- subset(GBD, age_group_name %in% c( "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus")) %>% rbind(., GBD_age4) %>% mutate(age_group_name=fct_relevel(age_group_name,c("0 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus"))) %>% #为后续分析做铺垫 arrange(age_group_name) ##4.6.4 最后审查人口学数据文件(对每一列审核,可以用unique(GBD$列名),如unique(GBD$sex_name)),这个GBD数据框可以保存,后期直接使用 GBD$sex_name[which(GBD$sex_name == 'both')] <- 'Both' ##4.6.5 调整人口学数据,整合成BAPC可以识别的数据结构:年份year(行) * 年龄分组age_group(列) GBD_select <- GBD %>% filter(location_name == 'Global', #根据需要调整 sex_name == "Both")#根据需要调整 GBD_select_n <- dcast(data = unique(GBD_select), year_id ~ age_group_name,value.var = c("val")) %>% column_to_rownames(var = 'year_id') ##4.6.6 获取标准人口结构数据,并调整数据结构。<1和0-4岁合并成一组 age_stand <- age_stand %>% filter(age %in% c("<1 year","1 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59","60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus")) wstand <- c(age_stand$std_population[1:2] %>% as.numeric() %>% sum(), age_stand$std_population[3:21] %>% as.numeric())/sum(age_stand$std_population[1:21]) #sum(age_stand$std_population[1:21])=100 ##4.6.7 整合历史数据和预测数据 data_all_pro <- matrix(data = NA, nrow = 2030 - 2019, ncol = ncol(GBD_select_n)) %>% as.data.frame() %>% `rownames<-` (.,seq(2019+1,2030,1)) %>% `colnames<-`(.,colnames(data_all_select_n)) #注意不是引号,是反引号,调用函数。最后的两排也可以用rownames(data_all_pro) <- seq(2019+1,2030,1)和colnames(data_all_pro) <- names(data_all_select_n) data_all_select_1990_to_2030 <- rbind(data_all_select_n, data_all_pro) a <- data_all_select_1990_to_2030 b <- GBD_select_n esoph <- APCList(a, b, gf = 5) bapc_result <- BAPC(esoph, predict = list(npredict = 10, retro = T), secondDiff = FALSE, stdweight = wstand, verbose = T) BAPC_list<- bapc_result #总结:上述内容中,需要个性化修改的部分:4.6.5 filter的location_name和sex_name;4.6.7 中的两个数字2030和2019 ``` #5C.5 作图数据准备,需个性化 ```{R} probs = c(0.002,seq(0.004, 0.996, by = 0.002), 0.998) newcolnames <- paste(probs, "Q", sep = "") data_plot_all <- data.frame(matrix(data = NA, nrow = 0, ncol = 6)) APCList <- qapc(BAPC_list, percentiles = probs) J <- nperiod(APCList) plab <- periodlabels(APCList) my.wm <- matrix(rep(stdweight(APCList), each = J), byrow = F, nrow = J) st <- 1990 - 1989 agg.n <- rowSums(pyrs(APCList)) scale <- 10^5 #根据单位来修改,目前的单位是“/100000” data_forplot <- t(agestd.proj(APCList)[st:J, newcolnames]/agg.n[st:J] * scale) %>% as.data.frame()%>% rownames_to_column('quantile') %>% pivot_longer(cols = colnames(.)[-1], names_to = 'year') %>% dplyr::mutate(year = as.integer(year)) %>% dplyr::mutate(sub = 'Both') #根据具体情况修改名称 median <- data_forplot[data_forplot$quantile == '0.5Q',c('year', 'value')] %>% dplyr::rename('median_data' = 'value') y <- epi(APCList) n <- pyrs(APCList) data_forplot <- left_join(data_forplot, median, by = 'year') data_forplot$median_data2 <- ifelse(data_forplot$year > 2019, NA, data_forplot$median_data) data_plot_all_Both <- rbind(data_plot_all,data_forplot) #得到了一个关键的作图数据框:data_plot_all_Both ``` #5.6可视化-rate ```{R} data_plot_all <- rbind(data_plot_all_Both, data_plot_all_Female,data_plot_all_Male) color_forpoint = c('Both' = '#66c2a5', 'Female' = '#fc8d62', 'Male' = '#8da0cb') #'Low SDI' = '#e78ac3', #'Low-middle SDI' = '#a6d854', #'Middle SDI' = '#ffd92f') shape_forpoint = c('Both' = 15, 'Female' = 16, 'Male' = 17) #'Low SDI' = 18, #'Low-middle SDI' = 25, #'Middle SDI' = 8) #color_forpoint和shape_forpiont都是根据需要来修改 color_forfan = '#eff3ff' plot <- ggplot(data_plot_all, aes(x=year,y=value, group = sub))+ geom_fan()+scale_fill_gradientn(colours = rep(color_forfan,2), guide = FALSE) + geom_point( aes(x = year, y = median_data2, color = sub,shape = sub))+ geom_line(aes(x = year, y = median_data, color = sub))+ scale_color_manual(values = color_forpoint)+ scale_shape_manual(values = shape_forpoint)+ scale_y_continuous(breaks = c(0.6,0.8,1.0,1.2,1.4,1.6,1.8),limits = c(0.5,1.8))+ geom_vline(xintercept = 2019,linetype = 2)+ ylab('ASMR (per 100,000)') + xlab('Year')+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_rect(fill = "transparent"), plot.background = element_rect(fill = "transparent"), axis.line = element_line(colour = "black"), text = element_text(size = 8), axis.text = element_text(colour = 'black'), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1), legend.key = element_blank(), legend.position = 'bottom', legend.title = element_blank()) #ggsave(plot, file="./output/Global_threesex_1990-2030_ASMR.pdf",units = "cm", width = 10,height = 8) #上述内容中,需要个性化修改的地方,都有备注 ``` #附件1:分析204个国家和预测数据中222个国家的交集 ```{R} #背景:1990-2019年的人口学数据文件是多个文件(文件1,包含Global和SDI),2019年以后的数据文件是一个整合文件(文件2,包含Global,不包含Global),我们想分析的是204个国家(文件3)。存在几个问题:1.文件1和文件2里面的Global可能不是一个Global;2.文件2里面没有SDI;3.文件1、文件2、文件3中的location分别有672,222和204个,其中222是672的子集,但是222和204之间是部分相交。因此最严谨的做法是先明确这204和222个国家的交集,再针对这个交集里面的国家进行分析。 ##明确1990-2019人口学数据中的location(672个) Current_GBD_Population <- read.csv("./data/Current_GBD_Population_SDI_change.csv",header = T) population_file_current <- './data/GBD_Population/' dirname <- dir(population_file_current) file <- paste0(population_file_current,dirname) var_name <- c("location_name","sex_name","year_id","age_group_name","val") GBD_population <- as.data.frame(matrix(nrow=0,ncol=length(var_name))) names(GBD_population)=var_name for (a in file) { data <- fread(a) %>% select(var_name) %>% filter(age_group_name %in% c("<1 year","1 to 4", "5 to 9","10 to 14", "15 to 19","20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94", "95 plus")) GBD_population <- rbind(GBD_population,data) } GBD_population$sex_name <- str_to_title(GBD_population$sex_name) #将sex_name这一列的首字母变成大写,如Male, Female, Both GBD_population <- GBD_population[!duplicated(GBD_population),] #去除重复行 GBD_population$location_name[which(GBD_population$location_name == "C么te d'Ivoire")] <- "Côte d'Ivoire" countries_672 <- unique(GBD_population$location_name) ##明确204个国家的list order <- readxl::read_xlsx('./data/2.orders.xlsx', sheet = 3) counties_204 <- order[!order$SDI %in% c('Global', 'SDI5','region21'),1,drop = T] ##明确预测文件中的222个location GBD_population_prediction <- read.csv('./data/population_projection_reference.csv',header = T) GBD_population_prediction$location_name[which(GBD_population_prediction$location_name == "C么te d'Ivoire")]<- "Côte d'Ivoire" GBD_population_prediction <- GBD_population_prediction %>% select("location_name","sex","year_id","age_group_name","val") counties_222 <- unique(GBD_population_prediction$location_name) ##明确上述3个list的关系 setdiff(counties_222,countries_672) #可以得出222是672的子集 setdiff(counties_204,countries_672) #可以得出204是672的子集 setdiff(counties_204,counties_222) #可以得出204不是222的子集,有9个国家的数据在预测数据中是缺失的。二者的交集是204-9=195个国家 ##确定最后分析的国家:195个 counties_included <- intersect(counties_204, counties_222) ``` #附件2:重新生成1990-2019年Global以及SDI的数据,整理总的人口学数据 ```{R} ### 提取GBD_population中195(204-9)个国家的数据,手动生成5个SDI及全球人口数 GBD_population <- GBD_population[GBD_population $location_name %in% counties_included,] #length(unique(GBD_population$location_name)) glo_current <- GBD_population %>% group_by(sex_name, year_id, age_group_name) %>% summarise(val = sum(val)) %>% mutate('location_name' = 'Global') %>% select(colnames(GBD_population)) for (i in c('High SDI','High-middle SDI','Middle SDI','Low-middle SDI','Low SDI')){ country <- order[order$SDI == i, 'Country', drop =T] num <- GBD_population %>% dplyr::filter(location_name %in% country) %>% group_by(sex_name, year_id, age_group_name) %>% summarise(val = sum(val)) %>% mutate('location_name' = i) GBD_population <- rbind(GBD_population, num) } GBD_population_1990to2019 <- rbind(GBD_population,glo_current) %>% select("location_name", "sex_name", "year_id","age_group_name", "val") write.csv(GBD_population_1990to2019, file="./data/GBD_population_1990to2019.csv") ``` #附件3:重新生成2019年以后的sex_both数据、Global以及SDI的数据,整理总的人口学数据 ```{R} order <- readxl::read_xlsx('./data/2.orders.xlsx', sheet = 3) cou_included <- order[!order$SDI %in% c('Global', 'SDI5','region21'),1,drop = T] prediction_var_name <- c("location_name","sex","year_id","age_group_name","val") GBD_population_prediction <- fread('./data/population_projection_reference.csv') %>% select(prediction_var_name) GBD_population_prediction[which(GBD_population_prediction$location_name == 'C么te d\'Ivoire'),'location_name'] <- "Côte d\'Ivoire" ## GBD_population_prediction中目前无Both,需要自行补充 Both <- GBD_population_prediction %>% group_by(location_name, year_id, age_group_name) %>% summarise(Both = sum(val)) %>% mutate('sex' = 'Both') %>% dplyr::rename('val' = 'Both') %>% select(colnames(GBD_population_prediction)) GBD_population_prediction <- rbind(GBD_population_prediction, Both) GBD_population_prediction <- GBD_population_prediction[GBD_population_prediction$location_name %in% counties_included,] ## 生成全球预测数据 glo <- GBD_population_prediction %>% group_by(sex, year_id, age_group_name) %>% summarise(val = sum(val)) %>% mutate('location_name' = 'Global') %>% select(colnames(GBD_population_prediction)) for (i in c('High SDI','High-middle SDI','Middle SDI','Low-middle SDI','Low SDI')){ country <- order[order$SDI == i, 'Country', drop =T] num <- GBD_population_prediction %>% dplyr::filter(location_name %in% country) %>% group_by(sex, year_id, age_group_name) %>% summarise(val = sum(val)) %>% mutate('location_name' = i) %>% select(colnames(GBD_population_prediction)) GBD_population_prediction <- rbind(GBD_population_prediction, num) } GBD_population_prediction <- rbind(GBD_population_prediction, glo) ## 完整的全球+5个SDI+195个国家(204-9)的预测数据 ### 将Early Neonatal,Late Neonatal,Post Neonatal 合并为 <1 year sep <- GBD_population_prediction[GBD_population_prediction$age_group_name %in% c('Early Neonatal','Late Neonatal','Post Neonatal'),] sep <- sep %>% group_by(location_name, sex, year_id) %>% summarise(val = sum(val)) %>% mutate(age_group_name = '<1 year') %>% select(1,2,3,5,4) sep2 <- GBD_population_prediction[!GBD_population_prediction$age_group_name %in% c('Early Neonatal','Late Neonatal','Post Neonatal'),] GBD_population_prediction <- rbind(sep, sep2) GBD_population_prediction <- unique(GBD_population_prediction) write.csv(GBD_population_prediction, file="./data/GBD_population_2019to2100.csv") ```