# Explicit Arrow schema handed to open_dataset() for the source datasets, so
# that each of them is read with the same 12 columns and column types.
# `level` is dictionary-encoded and flagged as ordered; `radek` and `chunk`
# are 32-bit integers; every other column is a string.
# Note: this variable shares its name with the schema() constructor. Calls to
# schema(...) keep working because R skips non-function bindings when it
# resolves the target of a call.
schema <- schema(prj_id = string(),
                 radek = int32(),
                 op_id = string(),
                 level = dictionary(ordered = TRUE),
                 geo_id = string(),
                 level_orig = string(),
                 geo_id_orig = string(),
                 rozpad_duvod = string(),
                 rozpad_typ = string(),
                 obec_puvod = string(),
                 chunk = int32(),
                 geo_id_long = string())

# Names of the dataset directories (under data-processed/) that are always
# combined.
ds_stubs <- c(
  "dtl_resolved_placeless",
  "dtl_resolved_smpl",
  "dtl-obecless-rozpadnute"
)

# The "dtl-nehier" dataset is included only when its all.parquet file is
# already present on disk.
file_nh_resolved_previously <- here::here(
  "data-processed", "dtl-nehier", "all.parquet"
)
if (file.exists(file_nh_resolved_previously)) {
  ds_stubs <- append(ds_stubs, "dtl-nehier")
}

# Open every source directory as its own dataset, all with the shared schema,
# then combine the resulting list into a single dataset.
dt_list <- purrr::map(
  ds_stubs,
  function(stub) {
    open_dataset(here::here("data-processed", stub), schema = schema)
  }
)
dt_all <- open_dataset(dt_list)

# Recorded output below: the row count of the combined dataset is not known
# at this point, only the number of columns.
dim(dt_all)
## Warning: Number of rows unknown; returning NA
## [1] NA 12
# Rewrite the combined dataset as partitioned parquet files and reopen it.
# The output path is resolved once via here::here() and reused for deleting,
# writing and reading, so all three steps hit the same directory regardless
# of the current working directory.
arrow_output_path <- here::here(cnf$arrow_output_dir)

# Remove any previous output first so that stale partition files cannot be
# mixed into the new dataset. Only a missing directory is tolerated; a delete
# that genuinely fails now raises an error instead of being silently ignored.
if (fs::dir_exists(arrow_output_path)) {
  fs::dir_delete(arrow_output_path)
}
write_dataset(dt_all, path = arrow_output_path,
              partitioning = cnf$arrow_partitioning,
              format = "parquet")
dss <- open_dataset(arrow_output_path)
dim(dss)
## [1] 7164487      12
dss$schema
## Schema
## prj_id: string
## radek: int32
## level: dictionary<values=string, indices=int32, ordered>
## geo_id: string
## level_orig: string
## geo_id_orig: string
## rozpad_duvod: string
## geo_id_long: string
## obec_puvod: string
## op_id: string
## chunk: int32
## rozpad_typ: string
# Peek at the first rows of the rewritten dataset.
collect(head(dss))

# Number of rows per chunk and op_id. Only the two columns needed are pulled
# into memory; the counting itself happens in dplyr after collect().
dss %>%
  select(chunk, op_id) %>%
  collect() %>%
  count(chunk, op_id)

# Distinct combinations of obec_puvod, rozpad_typ and op_id. After select()
# these are the only columns left, so distinct() over all columns yields the
# same result as naming them explicitly.
dss %>%
  select(obec_puvod, rozpad_typ, op_id) %>%
  collect() %>%
  distinct()
# Draw up to 10 distinct prj_id rows for every combination of op_id,
# rozpad_typ and obec_puvod.
# slice_sample() silently truncates n to the group size, which replaces the
# superseded sample_n(min(n(), 10)) idiom.
# No seed is set in the visible code, so the drawn sample differs between
# runs unless set.seed() is called earlier.
# The result is still grouped by op_id, rozpad_typ and obec_puvod; call
# ungroup() before using it in further dplyr verbs.
prj_smpl <- dss %>%
  select(prj_id, op_id, rozpad_typ, obec_puvod) %>%
  collect() %>%
  distinct() %>%
  group_by(op_id, rozpad_typ, obec_puvod) %>%
  slice_sample(n = 10)
# Export every row that belongs to one of the sampled projects.
dss_sample <- dss %>%
  filter(prj_id %in% prj_smpl$prj_id) %>%
  collect()
writexl::write_xlsx(x = dss_sample, path = here::here("sample_export.xlsx"))

# Export the rows of chunk 1 of "OP Z" to a separate workbook.
writexl::write_xlsx(
  x = dss %>%
    filter(op_id == "OP Z" & chunk == 1) %>%
    collect(),
  path = here::here("opz-one-chunk.xlsx")
)

# Materialise the complete dataset in memory.
dss_all <- collect(dss)