# Explicit Arrow schema applied to every input dataset opened further down, so
# that all sources are read with the same column types.
# The object shares its name with the constructor function; R still resolves
# `schema(...)` to the function whenever it is called.
schema <- schema(
  prj_id = string(),
  radek = int32(),
  op_id = string(),
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  level = dictionary(ordered = TRUE),
  geo_id = string(),
  level_orig = string(),
  geo_id_orig = string(),
  rozpad_duvod = string(),
  rozpad_typ = string(),
  obec_puvod = string(),
  chunk = int32(),
  geo_id_long = string()
)
# Folder names under data-processed/ that feed the combined dataset.
ds_stubs <- c(
  "dtl_resolved_placeless",
  "dtl_resolved_smpl",
  "dtl-obecless-rozpadnute"
)

# "dtl-nehier" is added only when its all.parquet file is already on disk.
file_nh_resolved_previously <- here::here(
  "data-processed", "dtl-nehier", "all.parquet"
)
if (file.exists(file_nh_resolved_previously)) {
  ds_stubs <- append(ds_stubs, "dtl-nehier")
}
# Open every source folder as an Arrow dataset with the shared schema, then
# combine the individual datasets into one.
dt_list <- purrr::map(
  ds_stubs,
  function(stub) {
    open_dataset(here::here("data-processed", stub), schema = schema)
  }
)
dt_all <- open_dataset(dt_list)

# Recorded output below: the row count was reported as unknown (NA) here.
dim(dt_all)
## Warning: Number of rows unknown; returning NA
## [1] NA 12
# Write the combined dataset as partitioned parquet files.
# The output directory is cleared first so that partitions left over from an
# earlier run cannot mix with the fresh export. A missing directory is simply
# skipped; any other deletion failure is reported as a warning instead of being
# swallowed silently, and the write still proceeds (best effort).
arrow_out_path <- here::here(cnf$arrow_output_dir)
if (fs::dir_exists(arrow_out_path)) {
  tryCatch(
    fs::dir_delete(arrow_out_path),
    error = function(e) {
      warning(
        "Could not delete ", arrow_out_path, ": ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
}
write_dataset(
  dt_all,
  path = arrow_out_path,
  partitioning = cnf$arrow_partitioning,
  format = "parquet"
)
# Re-open the dataset that was just written. The path is resolved through
# here::here(), the same way the write step resolves it, so the read does not
# depend on the current working directory.
dss <- open_dataset(here::here(cnf$arrow_output_dir))

# Dimensions and schema of the written dataset (recorded output kept below).
dim(dss)
## [1] 7164487 12
dss$schema
## Schema
## prj_id: string
## radek: int32
## level: dictionary<values=string, indices=int32, ordered>
## geo_id: string
## level_orig: string
## geo_id_orig: string
## rozpad_duvod: string
## geo_id_long: string
## obec_puvod: string
## op_id: string
## chunk: int32
## rozpad_typ: string
# Peek at the first rows of the dataset.
collect(head(dss))

# Row counts per chunk / op_id combination (counted in R after collecting
# just those two columns).
count(
  collect(select(dss, chunk, op_id)),
  chunk, op_id
)

# Distinct combinations of obec_puvod, rozpad_typ and op_id.
distinct(
  collect(select(dss, obec_puvod, rozpad_typ, op_id)),
  obec_puvod, rozpad_typ, op_id
)
# Draw up to 10 distinct rows (prj_id plus the grouping columns) for each
# op_id / rozpad_typ / obec_puvod combination.
# NOTE(review): no seed is set in this part of the script, so the sample
# presumably differs between runs; prj_smpl is left grouped.
prj_smpl <- collect(select(dss, prj_id, op_id, rozpad_typ, obec_puvod))
prj_smpl <- group_by(distinct(prj_smpl), op_id, rozpad_typ, obec_puvod)
prj_smpl <- sample_n(prj_smpl, min(n(), 10))

# Pull every row belonging to the sampled projects and export it to Excel.
dss_sample <- collect(filter(dss, prj_id %in% prj_smpl$prj_id))
writexl::write_xlsx(dss_sample, path = here::here("sample_export.xlsx"))
# Export the rows with op_id "OP Z" and chunk 1 to their own Excel file.
writexl::write_xlsx(
  collect(filter(dss, op_id == "OP Z" & chunk == 1)),
  path = here::here("opz-one-chunk.xlsx")
)
# Materialise the entire dataset in memory as a data frame.
dss_all <- collect(dss)