R para jurimetria

4.1 Revisão

Obtenção dos nossos dados

library(tidyverse)
library(esaj)
library(glue)
library(abjutils)
# Root directory where every downloaded and parsed artifact of this lesson
# is stored.
path <- "data-raw/camaras"
# `recursive = TRUE` also creates the parent "data-raw" folder when it does
# not exist yet. Without it the call fails on a fresh project, and that
# failure goes unnoticed because `showWarnings = FALSE` hides the warning.
# `showWarnings = FALSE` is kept so reruns stay quiet when the directory
# already exists.
dir.create(path, showWarnings = FALSE, recursive = TRUE)

Quais câmaras vamos baixar? Obs.: o código abaixo usa um pouco de stringr e dplyr, que veremos adiante.

# Alternative source (left disabled): camaras <- cjsg_table("courts")
# Here the court table is read from a copy saved on disk instead.
camaras <- read_rds("data/cjsg_camaras.rds")

# Keep only the courts whose name matches "Câmara ... Direito Criminal" at
# the end of the string, then extract their ids.
criminal_pattern <- "Câmara.*Direito Criminal$"
criminal_courts <- filter(camaras, str_detect(court, criminal_pattern))
id_camaras <- pull(criminal_courts, id)

Quantas decisões no total?

# Preview the search (empty query, selected criminal courts, decisions
# registered between 2017-12-01 and 2018-01-18) to see how many decisions
# it returns in total.
peek_cjsg(
  query = "",
  courts = id_camaras,
  registration_start = "2017-12-01",
  registration_end = "2018-01-18"
)

Baixando decisões: CJSG

# Folder that receives the raw CJSG search results.
cjsg_path <- glue("{path}/cjsg")

# Download the search results: empty query, selected criminal courts and the
# 2017-12-01 to 2018-01-18 registration window.
# `max_page = Inf` puts no upper bound on the number of pages.
# NOTE(review): `wait = 0.8` is presumably the pause between requests in
# seconds; confirm against the esaj documentation.
download_cjsg(
  query = "",
  path = cjsg_path,
  courts = id_camaras,
  registration_start = "2017-12-01",
  registration_end = "2018-01-18",
  max_page = Inf,
  wait = 0.8
)

# Full paths of the downloaded files whose name contains "page".
cjsg_files <- list.files(cjsg_path, pattern = "page", full.names = TRUE)

# Parse the downloaded pages into a tibble of decisions.
d_cjsg <- parse_cjsg(cjsg_files)

# Save the parsed tibble as a bzip2-compressed RDS file, then read it back
# and inspect its columns.
d_cjsg_file <- glue("{path}/d_cjsg.rds")
write_rds(d_cjsg, d_cjsg_file, compress = "bz2")

d_cjsg <- read_rds(d_cjsg_file)
glimpse(d_cjsg)

#> Observations: 11,731
#> Variables: 14
#> $ file            <chr> "data-raw/camaras/cjsg/page100.html", "data-ra...
#> $ id_page         <chr> "1981", "1982", "1983", "1984", "1985", "1986"...
#> $ id_decision     <chr> "11094999", "11093733", "11093677", "11093270"...
#> $ id_lawsuit      <chr> "0057003-20.2017.8.26.0000", "0052762-03.2017....
#> $ class_subject   <chr> "Classe/Assunto:\n\t\t\t\t\t\t\t\t\t\t\t Habea...
#> $ district        <chr> "Cosmópolis", "São Paulo", "Ribeirão Preto", "...
#> $ court           <chr> "3ª Câmara de Direito Criminal", "3ª Câmara de...
#> $ dt_decision     <chr> "19/12/2017", "19/12/2017", "19/12/2017", "14/...
#> $ dt_publication  <chr> "19/12/2017", "19/12/2017", "19/12/2017", "19/...
#> $ dt_registration <chr> "19/12/2017", "19/12/2017", "19/12/2017", "19/...
#> $ rapporteur      <chr> "Luiz Antonio Cardoso", "Luiz Antonio Cardoso"...
#> $ summary         <chr> NA, NA, NA, "Execução Penal –  Comutação de Pe...
#> $ txt_summary     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ result          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...

Baixando processos: CPOSG

# Folder that receives one downloaded file per lawsuit.
cposg_path <- glue("{path}/cposg")

# Download each lawsuit individually: take the distinct, non-missing lawsuit
# numbers found in the decisions, clean them with `clean_id()` and pass them
# to the downloader.
lawsuit_ids <- d_cjsg$id_lawsuit
lawsuit_ids <- unique(lawsuit_ids[!is.na(lawsuit_ids)])
download_cposg(clean_id(lawsuit_ids), cposg_path)

# Full paths of every file downloaded into the CPOSG folder.
cposg_files <- list.files(cposg_path, full.names = TRUE)

# Build the parser step by step: start from an empty parser and register the
# data, parts and decisions extractors, in that order.
parser <- make_parser()
parser <- parse_data(parser)
parser <- parse_parts(parser)
parser <- parse_decisions(parser)

# This parser saves its intermediate results as RDS files under `rds_path`.
rds_path <- glue("{path}/cposg_rds")
run_parser(cposg_files, parser, path = rds_path)
rds_files <- list.files(rds_path, full.names = TRUE)

# Read every intermediate file and stack the rows into one table.
cposg_raw <- map_dfr(rds_files, read_rds)

# Rebuild the formatted lawsuit number from the leading digits of `id`.
cposg_with_id <- mutate(
  cposg_raw,
  id_lawsuit = build_id(str_extract(id, "^[0-9]+"))
)

# Put the two identifier columns first and keep the rest in their order.
d_cposg <- select(cposg_with_id, id, id_lawsuit, everything())

# Save the parsed tibble as a bzip2-compressed RDS file, then read it back
# and inspect its columns.
d_cposg_file <- glue("{path}/d_cposg.rds")
write_rds(d_cposg, d_cposg_file, compress = "bz2")

d_cposg <- read_rds(d_cposg_file)
glimpse(d_cposg)

#> Observations: 11,762
#> Variables: 7
#> $ id         <chr> "00000037120168260073", "00000040920178260142", "00...
#> $ id_lawsuit <chr> "0000003-71.2016.8.26.0073", "0000004-09.2017.8.26....
#> $ file       <chr> "data-raw/camaras/cposg/00000037120168260073.html",...
#> $ hidden     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA...
#> $ data       <list> [<# A tibble: 11 x 2,    data              value  ...
#> $ parts      <list> [<# A tibble: 3 x 4,      id name                 ...
#> $ decisions  <list> [<# A tibble: 1 x 2,   date       decision        ...