library(curl)
library(dplyr)
library(data.table)
library(rvest)
# Start of the run; used at the end of the script to report the elapsed time.
t0 <- Sys.time()

# IP addresses to look up, with duplicates removed.
listaIP <- unique(c(
  "5.188.10.8",
  "5.62.56.55",
  "5.62.58.55",
  "5.9.158.75"
))

# Result table, seeded with one all-NA row so that every column exists with
# character type before rows are appended. The seed row has ip == NA, which is
# what the final `!is.na(ip)` filter of the script removes.
tablon <- data.table(
  ip = NA_character_,
  inetnum = NA_character_,
  Pais = NA_character_,
  descr = NA_character_,
  Tor = NA_character_,
  amazon = NA_character_,
  microsoft = NA_character_
)
# Collapse every run of whitespace to a single character and strip both ends.
# The lookbehind requires perl = TRUE.
colapsaEspacios <- function(txt) {
  gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", txt, perl = TRUE)
}

# Normalise a piece of text: collapse whitespace, replace every character that
# is not a lowercase letter or a parenthesis with a space, then trim.
# NOTE(review): the class "[^(a-z)]" keeps "(" and ")" as well as a-z; the
# intent was probably "[^a-z]" -- left unchanged to preserve existing output.
# Defined once, outside the loop, because it does not depend on the IP.
funcionLimpia <- function(txt) {
  trimws(gsub("[^(a-z)]", " ", colapsaEspacios(txt)))
}

# Fetch the whois page for a single IP and extract the fields of interest.
#
# ip: character scalar, the address to look up.
# Returns a one-row data.table with the columns ip, inetnum, Pais, descr, Tor,
# amazon and microsoft (all character). Fields that cannot be located are NA;
# Tor/amazon/microsoft are "" when nothing matches.
# Errors raised while downloading or parsing are propagated to the caller.
consultaWhois <- function(ip) {
  url <- paste0("http://whois.chromefans.org/", ip)
  manejador <- curl::new_handle("useragent" = "Mozilla/5.0")
  # read_html() replaces html(), which rvest deprecated and later removed.
  Pagina <- read_html(curl(url, handle = manejador))
  # CSS selector ".whois_info"; the XPath equivalent would be
  # [contains(concat( " ", @class, " " ), concat( " ", "whois_info", " " ))]
  x <- html_nodes(Pagina, ".whois_info")

  # Text of the <strong> nodes (presumably the field labels), normalised and
  # joined into one alternation so they can be stripped from the values.
  # Empty labels are dropped so the pattern has no empty alternatives.
  PalabrasClave <- funcionLimpia(tolower(html_text(html_nodes(x, "strong"))))
  PalabrasClave <- PalabrasClave[nzchar(PalabrasClave)]
  PalabraClave <- paste(PalabrasClave, collapse = "|")
  quitaClaves <- function(txt) gsub(PalabraClave, "", txt)

  # Flatten the block to lowercase single-spaced text and split it on ":".
  # Each piece then holds a value followed by the label of the next field.
  a <- html_text(x, trim = TRUE) %>%
    tolower %>%
    gsub(pattern = "\n", replacement = " ", x = ., fixed = TRUE) %>%
    colapsaEspacios %>%
    strsplit(split = ":") %>%
    unlist()
  # as.character() keeps `a` a character vector even when the page has no
  # ".whois_info" node (unlist() of an empty list is NULL).
  a <- as.character(a)
  a <- a[a != ""]

  # Piece that follows the first piece containing `clave`, or NA when the
  # label is absent or is the last piece.
  siguiente <- function(clave) {
    i <- grep(clave, a, fixed = TRUE)[1]
    if (is.na(i)) NA_character_ else a[i + 1]
  }

  # All pieces mentioning `clave`, joined with the " @CH@ " marker.
  # NOTE(review): this is a plain substring match, so "tor" also hits words
  # such as "operator" -- kept as in the original.
  coincidencias <- function(clave) {
    paste0(grep(clave, a, value = TRUE), collapse = " @CH@ ")
  }

  data.table(
    ip = ip,
    inetnum = trimws(toupper(quitaClaves(siguiente("inetnum")))),
    # Trimmed after keyword removal, consistently with inetnum and descr.
    Pais = trimws(toupper(quitaClaves(funcionLimpia(siguiente("country"))))),
    descr = trimws(toupper(quitaClaves(siguiente("descr")))),
    Tor = coincidencias("tor"),
    amazon = coincidencias("amazon"),
    microsoft = coincidencias("microsoft")
  )
}

# One lookup per IP. Rows are collected in a pre-allocated list and bound to
# `tablon` once at the end instead of growing the table on every iteration.
filas <- vector("list", length(listaIP))
for (k in seq_along(listaIP)) {
  j <- listaIP[[k]]

  # A failed request must not abort the whole run: report it and record an
  # all-NA row for this IP.
  filas[[k]] <- tryCatch(
    consultaWhois(j),
    error = function(e) {
      warning("whois lookup failed for ", j, ": ", conditionMessage(e),
              call. = FALSE, immediate. = TRUE)
      data.table(
        ip = j,
        inetnum = NA_character_,
        Pais = NA_character_,
        descr = NA_character_,
        Tor = NA_character_,
        amazon = NA_character_,
        microsoft = NA_character_
      )
    }
  )

  # Wait 1 second when no country was found, otherwise a random 5-15 seconds.
  espera <- if (is.na(filas[[k]]$Pais)) 1 else sample(x = 5:15, size = 1)
  cat("\nEsperamos ", espera, "\n")
  # Delay in seconds so the site does not flag the requests; the total wait
  # is small.
  Sys.sleep(time = espera)
  cat("\n", j, "terminado\n")
}
tablon <- rbindlist(c(list(tablon), filas))
# Report the total running time in minutes, framed by separator lines.
separador <- "\n==============================================\n"
cat(separador)
minutos <- round(
  as.numeric(difftime(time1 = Sys.time(), time2 = t0, units = "m")),
  2
)
cat("El proceso termino en ", minutos, " minutos\n")
cat(separador)

# Keep only rows that have an IP; this drops the all-NA seed row that
# `tablon` was initialised with.
tablon <- tablon[!is.na(tablon$ip), ]
# No hay comentarios:
# Publicar un comentario