# dressbarn-closings-scrape-light-analysis.R

library(rvest)
library(stringi)
library(urltools)
library(hrbrthemes) # ft_cols, theme_ft_rc(), theme_ipsum_rc() used below
library(worldtilegrid)
library(statebins)
library(tidyverse)

# this is the dressbarn locations directory page
pg <- read_html("https://locations.dressbarn.com/")

# this is the selector to get the main links
html_nodes(pg, "a.Directory-listLink") %>% 
  html_attr("href") -> locs
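
# at this point `locs` is a mix of bare two-character state slugs (states with more
# than one store) and longer relative paths that point straight at a single store
# page; that difference drives the nchar() split below. A quick peek (exact values
# depend on the live site):
head(locs)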

# PRE-NOTE
# No Sys.sleep() throttling here. (I looked at the web site, saw how many requests it
# makes to itself for all of its own resources, and concluded that the link scrapes +
# full page captures would not be burdensome; plus, they're going out of business.)
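#
# (a polite-delay helper is sketched here anyway in case someone re-runs this against
#  the live site; it is NOT used below, and the 1-second pause is an arbitrary choice)
polite_read_html <- function(url) {
  Sys.sleep(1)   # throttle between requests
  read_html(url) # otherwise a drop-in for the read_html() calls in the map()s below
}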

# Basic idea: get all the main state location pages.
# Some states only have one store, so the directory link goes straight to that store's
# page; handle that condition separately. For states with multiple stores, get all the
# links on the state index page. For links on a state index page that cover multiple
# stores in one area, grab all of those too. Then concatenate all the final target
# store links into one character vector. (A quick sanity check follows the pipeline.)

keep(locs, ~nchar(.x) == 2) %>% 
  sprintf("https://locations.dressbarn.com/%s", .) %>% # state has multiple listings
  map(
    ~read_html(.x) %>% 
      html_nodes("a.Directory-listLink") %>% 
      html_attr("href") %>% 
      sprintf("https://locations.dressbarn.com/%s", .)
  ) %>% 
  append(
    keep(locs, ~nchar(.x) > 2) %>% sprintf("https://locations.dressbarn.com/%s", .) # state has one store
  ) %>% 
  flatten_chr() %>% 
  map_if(
    ~stri_count_fixed(.x, "/") == 4, # 4 URL parts == there's another listing page layer
    ~read_html(.x) %>% 
      html_nodes("a.Teaser-titleLink") %>% 
      html_attr("href") %>% 
      stri_replace_first_fixed("../", "") %>% 
      sprintf("https://locations.dressbarn.com/%s", .)
  )  %>% 
  flatten_chr() -> listings
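
# URL-depth sanity check: the map_if() above assumes final store pages live at
# https://locations.dressbarn.com/<state>/<city>/<store> (5 "/"s total) while
# state/city index pages have fewer, so nothing at index-page depth should survive
# to this point
stopifnot(!any(stri_count_fixed(listings, "/") == 4))
length(listings) # roughly the store count; ~650 per the closure announcement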

# make a tibble with the HTML source for the final store location pages
# so we don't end up doing multiple retrievals

tibble(
  listing = listings,
  html_src = map_chr(listings, ~httr::GET(.x) %>% httr::content(as = "text"))
) -> dress_barn
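
# quick check that every page actually came back with some markup
stopifnot(all(nchar(dress_barn$html_src) > 0))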

# save off our work in the event we have a (non-R-crashing) issue
tf <- tempfile(fileext = ".rds")
print(tf)
saveRDS(dress_barn, tf) 
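# (if something goes sideways later in the session, the scrape can be restored with
#  `dress_barn <- readRDS(tf)`)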

# now, get data from the pages
#
# first, turn each stored HTML string into something we can pull nodes from
#
# dressbarn web folks handily put an Uber deep link on each page, so we get lon/lat for
# free from that URL's query string; they also handily used an <address> semantic tag
# in the proper schema.org PostalAddress format, so we can get locality and the actual
# street address, too
mutate(
  dress_barn,
  parsed = map(html_src, read_html),
  uber_link = 
    map_chr(
      parsed, ~html_nodes(.x, xpath=".//a[contains(@href, 'uber')]") %>% 
        html_attr("href") 
    ), 
  locality = map_chr(
    parsed, ~html_node(.x, xpath=".//address/meta[@itemprop = 'addressLocality']") %>% 
      html_attr("content")
  ),
  address = map_chr(
    parsed, ~html_node(.x, xpath=".//address/meta[@itemprop = 'streetAddress']") %>% 
      html_attr("content")
  ),
  state = stri_match_first_regex(
    dress_barn$listing, 
    "https://locations.dressbarn.com/([[:alpha:]]+)/.*$"
  )[,2]
) %>% 
  bind_cols(
    param_get(.$uber_link, c("dropoff%5Blatitude%5D", "dropoff%5Blongitude%5D")) %>% 
      as_tibble() %>% 
      set_names(c("lat", "lon")) %>%
      mutate_all(as.double)
  ) -> dress_barn
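
# to make the param_get() step concrete: the Uber deep links carry the dropoff
# coordinates as URL-encoded query parameters, e.g. (made-up illustrative URL, not a
# real store's link):
param_get(
  "https://m.uber.com/ul/?action=setPickup&dropoff%5Blatitude%5D=41.0262&dropoff%5Blongitude%5D=-73.6282",
  c("dropoff%5Blatitude%5D", "dropoff%5Blongitude%5D")
)
# ^ one row, one column per requested parameter, values still character strings,
#   hence the set_names()/mutate_all(as.double) cleanup above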

# save off our hard work with the HTML source so we can do more later if need be
select(dress_barn, -parsed) %>% 
  saveRDS("~/Data/dressbarn-with-src.rds")

# save off something others will want
select(dress_barn, -parsed, -html_src, -listing) %>% 
  jsonlite::toJSON() %>% 
  write_lines("~/Data/dressbarn-locations.json.gz")
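
# quick round-trip check on the file we just wrote (readr un-gzips based on the
# file extension, so the JSON reads straight back in)
jsonlite::fromJSON(readr::read_file("~/Data/dressbarn-locations.json.gz")) %>% 
  glimpse()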

# simple map
ggplot(dress_barn, aes(lon, lat)) + 
  geom_jitter(size = 0.25, color = ft_cols$yellow, alpha = 1/2) +
  coord_map("polyconic") +
  labs(
    title = "Locations of U.S. Dressbarn Stores",
    subtitle = "All 650 locations closing",
    caption = "Source: Dressbarn HTML store listings;\nData: <https://rud.is/dl/dressbarn-locations.json.gz> via @hrbrmstr"
  ) +
  theme_ft_rc(grid="") +
  theme_enhance_wtg()

unlink(tf) # cleanup 

# closings per state, normalized to closings per million residents (2015 population)
count(dress_barn, state) %>% 
  left_join(tibble(name = state.name, state = tolower(state.abb)), by = "state") %>% 
  left_join(usmap::statepop, by = c("name" = "full")) %>% 
  mutate(per_capita = (n / pop_2015) * 1000000) %>% 
  select(name, n, per_capita) %>% 
  arrange(desc(per_capita)) %>% 
  complete(name = state.name) %>% 
  statebins(state_col = "name", value_col = "per_capita") +
  scale_fill_viridis_c(name = "Closings per\nmillion residents") + # continuous fill scale
  labs(title = "Dressbarn per-capita closings by state") +
  theme_ipsum_rc(grid="") +
  theme_enhance_wtg()