library(rvest)
library(stringi)
library(urltools)
library(worldtilegrid)
library(statebins)
library(hrbrthemes) # assumed here for the ft_cols palette used in the plot below
library(tidyverse)

# this is the dressbarn locations directory page
pg <- read_html("https://locations.dressbarn.com/")

# this is the selector to get the main links
html_nodes(pg, "a.Directory-listLink") %>%
  html_attr("href") -> locs

# PRE-NOTE
# No sleep() code (I looked at the web site, saw how many self-requests it makes for all DB
# resources and concluded that link scrapes + full page captures would not be burdensome
# plus they're going out of business)

# basic idea here is to get all the main state location pages
# some states only have one store so the link goes right to that so handle that condition
# for ones with multiple stores get all the links on the state index page
# for links on state index page that have multiple stores in one area,
# grab all those; then, concatenate all the final target store links into one
# character vector.
keep(locs, ~nchar(.x) == 2) %>%
  sprintf("https://locations.dressbarn.com/%s", .) %>% # state has multiple listings
  map(
    ~read_html(.x) %>%
      html_nodes("a.Directory-listLink") %>%
      html_attr("href") %>%
      sprintf("https://locations.dressbarn.com/%s", .)
  ) %>%
  append(
    keep(locs, ~nchar(.x) > 2) %>%
      sprintf("https://locations.dressbarn.com/%s", .) # state has one store
  ) %>%
  flatten_chr() %>%
  map_if(
    ~stri_count_fixed(.x, "/") == 4, # 4 URL parts == there's another listing page layer
    ~read_html(.x) %>%
      html_nodes("a.Teaser-titleLink") %>%
      html_attr("href") %>%
      stri_replace_first_fixed("../", "") %>%
      sprintf("https://locations.dressbarn.com/%s", .)
  ) %>%
  flatten_chr() -> listings

# make a tibble with the HTML source for the final store location pages
# so we don't end up doing multiple retrievals
tibble(
  listing = listings,
  html_src = map_chr(listings, ~httr::GET(.x) %>% httr::content(as = "text"))
) -> dress_barn

# save off our work in the event we have a (non-R-crashing) issue
tf <- tempfile(fileext = ".rds")
print(tf)
saveRDS(dress_barn, tf)
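# If you adapt this to a site where request volume *does* matter, a minimal throttled
# fetcher is easy to drop in -- the helper name and the 2-second pause below are
# arbitrary choices for illustration, not anything the code above requires:
polite_read <- function(url, pause = 2) {
  Sys.sleep(pause) # wait before each request so we don't hammer the server
  read_html(url)   # then fetch & parse as usual
}
# swapping read_html(.x) for polite_read(.x) in the map() calls above is all it takes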
# now, get data from the pages
#
# first, turn all the character vectors into something we can get HTML nodes from
#
# dressbarn web folks handily put an "uber" link on each page so we get lon/lat for free in that URL
# they also handily used an <address> semantic tag in the proper PostalAddress schema format
# so we can get locality and actual address, too
mutate(
  dress_barn,
  parsed = map(html_src, read_html),
  uber_link = map_chr(
    parsed,
    ~html_nodes(.x, xpath = ".//a[contains(@href, 'uber')]") %>% html_attr("href")
  ),
  locality = map_chr(
    parsed,
    ~html_node(.x, xpath = ".//address/meta[@itemprop = 'addressLocality']") %>% html_attr("content")
  ),
  address = map_chr(
    parsed,
    ~html_node(.x, xpath = ".//address/meta[@itemprop = 'streetAddress']") %>% html_attr("content")
  ),
  # pull the two-letter state slug out of each listing URL
  state = stri_match_first_regex(
    dress_barn$listing,
    "https://locations.dressbarn.com/([[:alpha:]]+)/.*$"
  )[,2]
) %>%
  bind_cols(
    param_get(.$uber_link, c("dropoff%5Blatitude%5D", "dropoff%5Blongitude%5D")) %>%
      as_tibble() %>%
      set_names(c("lat", "lon")) %>%
      mutate_all(as.double)
  ) -> dress_barn

# save off our hard work with the HTML source so we can do more later if need be
select(dress_barn, -parsed) %>%
  saveRDS("~/Data/dressbarn-with-src.rds")

# save off something others will want
select(dress_barn, -parsed, -html_src, -listing) %>%
  jsonlite::toJSON() %>%
  write_lines("~/Data/dressbarn-locations.json.gz") # readr gzips based on the .gz extension

# simple map
ggplot(dress_barn, aes(lon, lat)) +
  geom_jitter(size = 0.25, color = ft_cols$yellow, alpha = 1/2) +
  coord_map("polyconic") + # projection support requires the mapproj package
  labs(
    title = "Locations of U.S. Dressbarn Stores",
    subtitle = "All 650 locations closing",
    caption = "Source: Dressbarn HTML store listings;\nData: