Kompilace dat

# Explicit Arrow schema shared by all input datasets. It is passed to every
# open_dataset() call below so that the inputs agree on column types.
# `level` is an ordered dictionary (factor-like) column; `radek` and `chunk`
# are 32-bit integers; all remaining columns are strings.
schema <- schema(prj_id = string(),
                 radek = int32(),
                 op_id = string(),
                 level = dictionary(ordered = TRUE),
                 geo_id = string(),
                 level_orig = string(),
                 geo_id_orig = string(),
                 rozpad_duvod = string(),
                 rozpad_typ = string(),
                 obec_puvod = string(),
                 chunk = int32(),
                 geo_id_long = string())

# Directory names under data-processed/ that always enter the compilation.
ds_stubs <- c(
  "dtl_resolved_placeless",
  "dtl_resolved_smpl",
  "dtl-obecless-rozpadnute"
)

# The "dtl-nehier" directory is optional: it is added only when its
# all.parquet file is already present on disk.
file_nh_resolved_previously <- here::here(
  "data-processed", "dtl-nehier", "all.parquet"
)
if (file.exists(file_nh_resolved_previously)) {
  ds_stubs <- append(ds_stubs, "dtl-nehier")
}

# Open each input directory as an Arrow dataset, forcing the shared schema.
dt_list <- purrr::map(ds_stubs, function(stub) {
  open_dataset(here::here("data-processed", stub), schema = schema)
})

# Combine the per-directory datasets into one dataset and show its dimensions.
dt_all <- open_dataset(dt_list)
dim(dt_all)

## Warning: Number of rows unknown; returning NA

## [1] NA 12

# Start from a clean output directory so that files left over from a previous
# run cannot get mixed into the newly written dataset. A missing directory is
# fine (first run); a failed deletion is surfaced instead of being swallowed.
if (fs::dir_exists(here::here(cnf$arrow_output_dir))) {
  fs::dir_delete(here::here(cnf$arrow_output_dir))
}

# Write the combined dataset as partitioned parquet files.
write_dataset(dt_all, path = here::here(cnf$arrow_output_dir),
              partitioning = cnf$arrow_partitioning,
              format = "parquet")

# Re-open the dataset that was just written. The path is resolved with
# here::here() exactly as in the write step, so reading does not depend on
# the current working directory.
dss <- open_dataset(here::here(cnf$arrow_output_dir))

dim(dss)

## [1] 7164487      12

# Print the schema of the re-opened dataset to check the column types.
dss$schema

## Schema
## prj_id: string
## radek: int32
## level: dictionary<values=string, indices=int32, ordered>
## geo_id: string
## level_orig: string
## geo_id_orig: string
## rozpad_duvod: string
## geo_id_long: string
## obec_puvod: string
## op_id: string
## chunk: int32
## rozpad_typ: string

# Pull the first few rows into memory for a quick look.
collect(head(dss))

prj_id <chr>	radek <int>	level <ord>	geo_id <chr>	level_orig <chr>	geo_id_orig <chr>
CZ.05.4.27/0.0/0.0/17_088/0008643	1	obec	545112	chkonp	NA
CZ.05.4.27/0.0/0.0/17_088/0008643	2	obec	544931	chkonp	NA
CZ.05.4.27/0.0/0.0/17_088/0008643	3	obec	549533	chkonp	NA
CZ.05.4.27/0.0/0.0/17_088/0008643	4	obec	549401	chkonp	NA
CZ.05.4.27/0.0/0.0/17_088/0008643	5	obec	556874	chkonp	NA
CZ.05.4.27/0.0/0.0/17_088/0008643	6	obec	556980	chkonp	NA

# Number of rows per chunk / op_id combination (counted in memory after
# collecting only the two columns needed).
count(
  collect(select(dss, chunk, op_id)),
  chunk, op_id
)

# Distinct combinations of obec_puvod, rozpad_typ and op_id present in the data.
distinct(
  collect(select(dss, obec_puvod, rozpad_typ, op_id)),
  obec_puvod, rozpad_typ, op_id
)

# Random sample of projects: up to 10 distinct prj_id values for every
# combination of op_id, rozpad_typ and obec_puvod.
# slice_sample() replaces the superseded sample_n(); with replace = FALSE
# (the default) it silently caps n at the group size, so no min(n(), 10) is
# needed. The draw is random and no seed is set in this block.
prj_smpl <- dss %>% 
  select(prj_id, op_id, rozpad_typ, obec_puvod) %>% 
  collect() %>% 
  distinct() %>% 
  group_by(op_id, rozpad_typ, obec_puvod) %>% 
  slice_sample(n = 10) %>% 
  # drop the grouping so it does not leak into later operations
  ungroup()

# All rows belonging to the sampled projects, pulled into memory.
dss_sample <- collect(
  filter(dss, prj_id %in% prj_smpl$prj_id)
)

# Export the sampled rows to an Excel workbook in the project root.
writexl::write_xlsx(
  dss_sample,
  here::here("sample_export.xlsx")
)

# Export a single chunk (chunk 1) of the "OP Z" rows to its own workbook.
writexl::write_xlsx(
  collect(filter(dss, op_id == "OP Z", chunk == 1)),
  here::here("opz-one-chunk.xlsx")
)

# Materialise the entire dataset in memory as a data frame.
dss_all <- collect(dss)