# pt-example.R -rw-r--r-- 6.4 KiB View raw
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#' ---
#' title: "Working with the PacketTotal API in R"
#' author: ""
#' date: ""
#' output:
#'   html_document:
#'     keep_md: true
#'     theme: simplex
#'     highlight: monochrome
#' ---
#+ init, include=FALSE
# Chunk defaults for the whole document: no messages/warnings, collapsed
# output, PNG figures at 10x6 with retina scaling.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  dev = "png",
  collapse = TRUE,
  fig.retina = 2,
  fig.width = 10,
  fig.height = 6
)

#+ begin

#' The crazy/kind folks over at [PacketTotal](https://packettotal.com/) were
#' generous enough to slip me an [API key](https://packettotal.com/api.html), and 
#' long-time readers of the blog know what that means: a new [package](https://cinc.rud.is/web/packages/packettotal/)!
#' 
#' ### What is PacketTotal?
#' 
#' If you have a non-compliance-focused job in information security chances are you
#' will have come across or had the need to generate [packet captures](https://en.wikipedia.org/wiki/Pcap)
#' of network traffic to chase down a situation. PacketTotal seems to be aiming to 
#' aggregate and socialize the analysis of packet captures in similar fashion to
#' what [VirusTotal](https://www.virustotal.com/) does to files/binaries.
#' 
#' PCAPs are a bit trickier than what VirusTotal handles since they may contain 
#' sensitive organizational data — at the very least private addressing schemes 
#' — but, I suspect they're working on some sanitization tools to make it easier 
#' to do that and are also doing a decent job at ensuring they're not logging the IP address
#' (or any other identifying data) of the uploader.
#' 
#' Their [online exploratory interface](https://packettotal.com/app/search?q=) is fairly
#' robust but by providing an API they make it possible for one to go beyond such 
#' an interface and enhance a dynamic investigation on-the-fly while keeping a record of 
#' analysis flow and artifacts.
#' 
#' We won't be doing that in this post since it is just an introductory "this is how
#' the site/package works" post but once they round out some corners we may delve into a 
#' full (faux) investigation and perhaps write our own investigations UX with Shiny.
#' 
#' Onwards!
#' 
#' ### Using the PacketTotal API
#' 
#' I kept the dependencies pretty thin so the extra `library()` calls I'm putting in here 
#' are mostly for analysis & visualization support. Let's get them out of the way:

#+ libs
library(zip)
library(DT)
library(packettotal)
library(lubridate)
library(hrbrthemes)
library(tidyverse)

#' Now, let's look for [Emotet](https://www.us-cert.gov/ncas/alerts/TA18-201A),
#' which is a nasty piece of malware your organization has likely been hit with multiple
#' times by now. To do that, we need to issue a query on the "deep search"
#' endpoint:

# Search for "emotet"; `es` is passed to pt_get_search_results() next.
es <- pt_deep_search("emotet")

#' Now, we get those results and take a look:
emo_res <- pt_get_search_results(es)

# Show the first ten entries of the `results` element.
head(emo_res[["results"]], n = 10)

#' Let's get even more detail:
emo_det <- pt_detail("5b4eb1fc54db6761bb42385d1ac52b8a")

#' and, see what's in the summary:
str(emo_det[["analysis_summary"]], max.level = 1)

#' Who are the top talkers (the IP addresses with the most connections)?
str(emo_det[["analysis_summary"]][["top_talkers"]])

#' Let's use [ipinfo.io](https://ipinfo.io/) to see some extra detail on that main one:
ip_5.187.0.158 <- ipinfo::query_ip("5.187.0.158")

str(ip_5.187.0.158)

#' We can also look up various stats (these JSON strings are going to be real 
#' percentages soon from the API):

# Both statistics blocks are elements of the analysis summary.
str(emo_det[["analysis_summary"]][["dns_statistics"]])

str(emo_det[["analysis_summary"]][["file_statistics"]])

#' So, we get FQDNs, files, DNS queries and more. We can also just get
#' every bit of data PacketTotal could squeeze out of the PCAP by downloading
#' an "analysis" archive:

dl <- pt_download(
  "5b4eb1fc54db6761bb42385d1ac52b8a",
  dl_dir = "~/Data"
)

#' We'll unpack it and take a look:

unzip(zipfile = dl, exdir = "~/Data/5b4eb1fc54db6761bb42385d1ac52b8a")

list.files(path = "~/Data/5b4eb1fc54db6761bb42385d1ac52b8a")

#' We won't explore all of these in this post but `conn.csv` is the Zeek 
#' (formerly, ugh, 'Bro' &mdash; which was short for 'Big Brother' b/c it was
#' snooping on your packets, but still&hellip;) connection logs. That's something
#' I'm super familiar with given that we generate tens of thousands of them every 
#' day at $WORK in our massive honeypot network, so let's poke at it:

# Load the connection log, treating "null" and empty strings as NA, then
# clean up the column names. The result is assigned with a leading `<-` so the
# name of the object being created is visible before the pipeline starts.
conns <- read_csv(
  "~/Data/5b4eb1fc54db6761bb42385d1ac52b8a/conn.csv",
  na = c("null", "")
) %>%
  janitor::clean_names()

glimpse(conns)

#' (They're also fixing the un-friendly-for-data science column names.)
#' 
#' Lots of info about the connections, and we can make our own exploratory
#' interface for them pretty easily:

conns %>%
  DT::datatable()

#' But, we can also attack it with the tidyverse:

conns %>%
  count(target_port, service, sort = TRUE)

conns %>%
  count(sender_ip, sort = TRUE)

conns %>%
  count(target_ip, sort = TRUE)

# Bucket connections by minute and plot the per-protocol counts over time.
# The bucket column is called `minute` because it holds minute-floored
# timestamps; the default x-axis title therefore agrees with the
# "per-minute" plot title.
mutate(conns, minute = floor_date(timestamp, "minute")) %>%
  count(minute, transport_protocol) %>%
  ggplot(aes(minute, n)) +
  geom_line() +
  facet_wrap(~transport_protocol) +
  labs(title = "Total Connections-per-minute by Protocol") +
  theme_ft_rc(grid = "XY")

# Density of payload bytes sent vs. received, one facet per measure, on a
# log10 x-axis.
# `pivot_longer()` stacks the two columns into measure/value pairs;
# `values_transform` coerces each column to numeric before they are combined,
# so differing column types cannot break the reshape.
select(conns, payload_bytes_sent, payload_bytes_received) %>%
  pivot_longer(
    cols = everything(),
    names_to = "measure",
    values_to = "value",
    values_transform = list(value = as.numeric)
  ) %>%
  ggplot(aes(value)) +
  ggalt::geom_bkde(fill = alpha(ft_cols$gray, 1/3)) +
  # Spell out `labels` in full rather than relying on partial argument matching.
  scale_x_log10(labels = scales::comma) +
  labs(
    title = "Payload metadata distributions",
    subtitle = "Note: Log10 Scale"
  ) +
  facet_wrap(~measure) +
  theme_ft_rc(grid = "XY")

#' We can even see any threat intelligence they were able to enrich the 
#' data with:

# Read intel.csv ("null"/"" as NA), clean the column names and render the
# result as an interactive table.
DT::datatable(
  janitor::clean_names(
    read_csv(
      file = "~/Data/5b4eb1fc54db6761bb42385d1ac52b8a/intel.csv",
      na = c("null", "")
    )
  )
)

#' We can also look for similar PCAPs:

sim <- pt_similar("5b4eb1fc54db6761bb42385d1ac52b8a")

# Only the top level of the nested results is printed.
str(sim[["similar"]][["results"]], max.level = 1)

#' This is where the power of the API would really come in handy as we 
#' collect all this information and start to look for correlations, 
#' time series patterns (or anomalies) and possibly extract features
#' to help build models to detect various types of malicious traffic.
#' 
#' ### FIN
#' 
#' Visit the [package page](https://cinc.rud.is/web/packages/packettotal/) for information
#' on how to install it and you can find it on [SourceHut](https://git.sr.ht/~hrbrmstr/packettotal),
#' [GitLab](https://gitlab.com/hrbrmstr/packettotal) or (ugh) [GitHub](https://github.com/hrbrmstr/packettotal).
#' 
#' Keep watching their service/API since it's only going to get even better and 
#' definitely toss up suggestions for package features or jump on in and file some 
#' PRs at your social coding hub of choice.