library(tidyverse)
library(arrow)
library(readxl)
library(writexl)
library(fs)
source("load_core_data.R")
source("read_metadata.R")
source("shared.R")
prjs_geostatus <- read_parquet(here::here("data-processed",
"projects-geo-check-groups.parquet"),
col_select = c("prj_id", "geostatus"))
places_geostatus <- read_parquet(here::here("data-processed",
"places-geo-check.parquet"))
prj_id_nh <- prjs_geostatus %>% filter(geostatus == "více míst nehierarchicky") %>%
pull(prj_id)
plcnh <- places_geostatus %>% filter(prj_id %in% prj_id_nh)
file_nh_resolved_previously <- here::here("data-processed", "dtl-nehier", "all.parquet")
if(file.exists(file_nh_resolved_previously)) {
dtl_nh_resolved_previously <- read_parquet(file_nh_resolved_previously)
} else {
dtl_nh_resolved_previously <- tibble(prj_id = character())
}
dtl_nh_checked <- plcnh %>%
left_join(dts) %>%
inner_join(dtl %>%
filter(prj_id %in% prj_id_nh) %>%
select(prj_id, prj_radek, typ, value, level) %>%
spread(typ, value) %>%
select(prj_id, value = id, geo_id = shortid, nazev, level)) %>%
rename(geo_id_long = value, geo_nazev = nazev) %>%
mutate(vybrat = NA,
geo_id = ifelse(is.na(geo_id), geo_id_long, geo_id),
geo_id_long = if_else(level %in% c("obec", "zuj"), geo_id_long, NA_character_),
resolved = if_else(prj_id %in% dtl_nh_resolved_previously$prj_id,
TRUE, FALSE)) %>%
bind_rows(dtl_nh_resolved_previously %>%
select(-any_of(c("rozpad_duvod", "obec_puvod",
"rozpad_typ", "chunk", "radek")))) %>%
group_by(prj_id) %>%
mutate(oddelovac = if_else(row_number() == n(), "--------", NA_character_)) %>%
ungroup() %>%
relocate(op_id, prj_nazev, geo_id, geo_nazev, level, vybrat,
oddelovac, prj_anotace, resolved)
## Joining, by = c("op_id", "prj_id")
## Joining, by = c("prj_id", "value", "level")
writexl::write_xlsx(dtl_nh_checked %>% filter(!resolved),
here::here("data-output", "prj_nehierarchicke-vse.xlsx"))
dtl_nh_resolved_all <- read_xlsx(here::here(cnf$input_resolved_xlsx)) %>%
mutate(level = factor(level, levels = geolevels, ordered = T),
vybrat = as.logical(vybrat))
prjs_included_by_default <- dtl_nh_resolved_all %>%
group_by(prj_id) %>%
filter(all(is.na(vybrat))) %>% distinct(prj_id) %>% pull()
prjs_with_some_dropped <- dtl_nh_resolved_all %>%
group_by(prj_id) %>%
filter(!all(is.na(vybrat))) %>% distinct(prj_id) %>% pull()
prjs_with_all_dropped <- dtl_nh_resolved_all %>%
group_by(prj_id) %>%
filter(all(!vybrat)) %>% distinct(prj_id) %>% pull()
dtl_nh_resolved_true <- dtl_nh_resolved_all %>%
group_by(prj_id) %>%
# these are included by default - they were not touched by the user
mutate(all_na = all(is.na(vybrat))) %>%
# keep all those untouched and those lines where user said to keep
filter(all_na | vybrat)
length(unique(dtl_nh_resolved_true$prj_id))
## [1] 119
setdiff(unique(dtl_nh_resolved_true$prj_id), prj_id_nh)
## character(0)
dtl_nh_resolved_obce <- dtl_nh_resolved_true %>%
filter(level == "obec")
dtl_nh_resolved_zuj <- dtl_nh_resolved_true %>% filter(level == "zuj")
dtl_nh_resolved_ostatni <- dtl_nh_resolved_true %>% filter(!level %in% c("obec", "zuj"))
rozpadac_na_obce <- read_parquet(here::here("data-processed", "rozpadac-na-obce.parquet"))
dtl_nh_resolved_obce_fin <- dtl_nh_resolved_obce %>%
select(prj_id, geo_id, level_orig = level, op_id) %>%
mutate(rozpad_duvod = "manuálně vybrána obec",
obec_puvod = "manuální řešení původně nekonzistentní geolokace",
level = "obec",
geo_id_orig = geo_id)
dtl_nh_resolved_ostatni_fin <- dtl_nh_resolved_ostatni %>%
left_join(rozpadac_na_obce %>%
rename(level = rozpad_jak, geo_id = value)) %>%
select(prj_id, geo_id_orig = geo_id, level, geo_id = obec, op_id) %>%
mutate(level = "obec",
rozpad_duvod = "mechanický rozpad z manuálně vybraného ORP, okresu nebo kraje",
obec_puvod = "")
## Joining, by = c("geo_id", "level")
dtl_nh_resolved_zuj_fin <- dtl_nh_resolved_zuj %>%
ungroup() %>%
mutate(level_orig = level,
geo_id_orig = geo_id,
level = if_else(geo_id %in% ids$obec,
"obec",
as.character(level_orig)) %>%
factor(geolevels, ordered = T)) %>%
select(prj_id, op_id, geo_id, geo_id_orig, level, level_orig) %>%
mutate(rozpad_duvod = "manuálně vybrána ZÚJ; úroveň převedena na obec kde se obec a ZÚJ kryjí")
dtl_nh_resolved_all <- bind_rows(dtl_nh_resolved_obce_fin,
dtl_nh_resolved_zuj_fin,
dtl_nh_resolved_ostatni_fin) %>%
mutate(level_orig = as.character(level)) %>%
group_by(prj_id) %>%
mutate(radek = row_number(),
rozpad_typ = "více míst nehierarchicky",
obec_puvod = "manuální řešení původně nekonzistentní geolokace") %>%
add_chunk_number() %>%
select(prj_id, radek, level, geo_id, level_orig, geo_id_orig, rozpad_duvod,
obec_puvod, op_id, chunk, rozpad_typ) %>%
mutate(level_orig = as.character(level),
level = factor(level, levels = geolevels, ordered = T)) %>%
arrange(prj_id, geo_id, desc(level), level_orig) %>%
distinct(prj_id, op_id, geo_id, level, .keep_all = T) %>%
add_long_geoid(ids)
## Adding missing grouping variables: `op_id`
## Joining, by = c("prj_id", "op_id")
## Joining, by = c("level", "geo_id")
dir.create(here::here("data-processed", "dtl-nehier"), showWarnings = F)
write_parquet(dtl_nh_resolved_all, file_nh_resolved_previously)