1

我正在使用 raster 和 glcm 包来计算卫星图像上的 Haralick 纹理特征。我已经使用单核成功运行了 glcm() 函数,但正在并行运行它。这是我正在使用的代码:

# tiles is a list of raster extents, r is a raster

registerDoParallel(7)

out_raster = foreach(i=1:length(tiles),.combine = merge,.packages=c("raster","glcm")) %dopar% 

    glcm(crop(r,tiles[[i]]), n_grey=16, window=c(17,17), shift = c(1,1),
             min_x = rmin, max_x = rmax)

当我检查创建的临时文件时,似乎合并的每个步骤都会创建一个临时文件,这会占用大量硬盘空间。这是整体图像(2GB):

全光栅

这里有两个临时文件:Merge Step 1 Merge Step 2

由于每个切片的 glcm 函数输出为 3 GB,因此为每个逐步合并操作创建一个临时文件会创建约 160 GB 的临时栅格文件。有没有更节省空间的方法来并行运行它?

4

1 回答 1

0

我设法通过使用 gdal 和构建 vrts 来节省硬盘空间。下面是我在 glcm 包中的示例数据上编写的代码。步骤是 1:创建瓦片的 vrt 文件;2)在每个vrt tile上并行运行glcm函数(见glcm_parallel函数);3)将瓦片合并成一个vrt并使用gdal warp写入输出光栅。vrt 文件非常小,唯一的临时文件只是那些由 glcm 函数创建的文件。这对大型栅格应该有很大帮助。

#Load Packages
library(raster)
library(sf)
library(rgdal)
library(glcm)
library(doParallel)
library(gdalUtils)

#Source helper functions
source("./tilebuild_buff.R")
source("./glcm_parallel.R")

#Read raster - example data from glcm package (saved to disk)
rasterfile = "./L5TSR_1986.tif"
r = raster("L5TSR_1986.tif")

#Create tiles directory if it doesn't exist and clear files if it exists
if(!file.exists("./tiles")){dir.create("./tiles")}
file.remove(list.files("./tiles/",full.names=T))

#Calculate tiles for parallel processing - returns x and y offsets, and widths
#to use with gdal_translate
jobs_buff = tilebuild_buff(r,nx=5,ny=2,buffer=c(5,5))

#Create vrt files for buffered tiles
for (i in 1:length(jobs_buff[,1])){
  fin = rasterfile
  fout = paste0("./tiles/t_",i,'.vrt')
  ex = as.numeric(jobs_buff[i,])
  gdal_utils('translate',fin,fout,options = c('-srcwin',ex,'-of','vrt'))
}

#Read in vrt files of raster tiles and set the nodata value
input.rasters = lapply(paste0("./tiles/", list.files("./tiles/",pattern="\\.vrt$")), raster)

for(i in 1:length(input.rasters)){ NAvalue(input.rasters[[i]])= -3.4E38 }

#Create a directory for temporary raster grids and clear files
tempdir = "./rastertemp/"
if(!file.exists(tempdir)){dir.create(tempdir)}
file.remove(list.files(tempdir,full.names=T))


registerDoParallel(6) 

#Determine min and max values over original raster
rmin = cellStats(r,'min')
rmax = cellStats(r,'max')

#Run glcm function in parallel
glcm_split = foreach(i=1:length(jobs_buff[,1]),.packages=c("raster","glcm")) %dopar%
  glcm_parallel(inlist = input.rasters,temp=tempdir,window=c(3,3),n_grey=16,
                min_x=rmin,max_x=rmax)

#Get list of temp raster files created by glcm function
temps = paste0(tempdir,list.files(tempdir,pattern="\\.grd$"))

#trim off buffer (from tilebuild_buff function) and create mosaic raster
mosaic_rasters(temps,dst_dataset = "./mosaic.vrt", trim_margins = 5, srcnodata=-3.4E38,overwrite=T)

#write output tif
vrt_mosaic = "./mosaic.vrt"
outtif = "./final_merged.tif"
gdalwarp(vrt_mosaic,outtif,overwrite=T,verbose=T)

两个辅助函数在这里:

glcm_parallel <- function(inlist, temp, n_grey=16, window=c(11,11), shift=c(1,1), min_x=NULL, max_x=NULL){

  require(glcm)
  #todisk option required if output rasters are small enough to fit in memory
  rasterOptions(tmpdir = temp, todisk=T, maxmemory = 1E8)

  ## run glcm over tile
  r_glcm=glcm(inlist[[i]], n_grey = n_grey, window = window, shift=shift, min_x=min_x, max_x=max_x, na_opt = 'any')

}

和这里:

tilebuild_buff <- function(r, nx=5, ny=2, buffer=c(0,0)){

round_xw = floor(ncol(r)/nx)
xsize = c(rep(round_xw,nx-1), round_xw + ncol(r)%%nx)
xoff = c(0,cumsum(rep(round_xw,nx-1)))

round_yh = floor(nrow(r)/ny)
ysize = c(rep(round_yh,ny-1), round_yh + nrow(r)%%ny)
yoff = c(0,cumsum(rep(round_yh,ny-1)))

pix_widths = expand.grid(xsize = xsize ,ysize = ysize)
offsets = expand.grid(xoff = xoff,yoff = yoff)

srcwins = cbind(offsets,pix_widths)

srcwins_buff = srcwins

#Add buffer
srcwins_buff$xoff = srcwins$xoff - buffer[1]
srcwins_buff$yoff = srcwins$yoff - buffer[2]
srcwins_buff$xsize = srcwins$xsize + 2*buffer[1]
srcwins_buff$ysize = srcwins$ysize + 2*buffer[2]

return(srcwins_buff)

}
于 2020-03-26T15:31:21.577 回答