miércoles, 3 de octubre de 2018

Scrapeando IPs por país en R

library(rvest)
library(curl)
library(dplyr)
library(data.table)

# Start-of-run timestamp; read again at the end of the script to report the
# elapsed time.
t0 <- Sys.time()

# IP addresses to look up, with duplicates removed.
listaIP <- unique(c("5.188.10.8", "5.62.56.55", "5.62.58.55", "5.9.158.75"))

# Result table, seeded with one all-NA character row so that every column
# already has its name and type before any result row is appended.
tablon <- data.table(
  ip        = NA_character_,
  inetnum   = NA_character_,
  Pais      = NA_character_,
  descr     = NA_character_,
  Tor       = NA_character_,
  amazon    = NA_character_,
  microsoft = NA_character_
)



# Normalises a text fragment: removes repeated, leading and trailing
# whitespace, then replaces every character that is not a lowercase letter or
# a parenthesis with a space and trims the result.
# Defined once here instead of on every loop iteration.
funcionLimpia <- function(txt) {
  txt %>%
    gsub(x = ., pattern = "(?<=[\\s])\\s*|^\\s+|\\s+$", replacement = "",
         perl = TRUE) %>%
    gsub(pattern = "[^(a-z)]", replacement = " ", x = .) %>%
    trimws()
}

# Returns the fragment that follows the first fragment containing `etiqueta`
# (literal match), or NA when the label is absent or is the last fragment.
valorTrasEtiqueta <- function(a, etiqueta) {
  i <- grep(pattern = etiqueta, x = a, fixed = TRUE)[1]
  a[i + 1]
}

# Joins every fragment matching `patron` with " @CH@ "; yields "" when no
# fragment matches.
fragmentosCon <- function(a, patron) {
  paste0(a[grep(pattern = patron, x = a)], collapse = " @CH@ ")
}

# Downloads the whois page for one IP and returns a one-row data.table with
# the same columns as `tablon`.
#
# ip: a single IP address as a character string.
# Errors raised while downloading or parsing propagate to the caller.
consultaIP <- function(ip) {
  url <- paste0("http://whois.chromefans.org/", ip)
  # read_html() replaces the long-deprecated html() reader.
  Pagina <- read_html(
    curl(url, handle = curl::new_handle("useragent" = "Mozilla/5.0"))
  )
  # XPath form of the selector below:
  # [contains(concat( " ", @class, " " ), concat( " ", "whois_info", " " ))]
  x <- html_nodes(Pagina, ".whois_info")

  # The <strong> elements hold the field labels. They are combined into one
  # alternation so labels that trail a value can be stripped from it.
  PalabrasClave <- html_nodes(x, "strong") %>%
    html_text() %>%
    tolower() %>%
    funcionLimpia()
  PalabraClave <- paste(PalabrasClave, collapse = "|")

  # Flatten the block to lowercase text on one line, squeeze whitespace and
  # split on ":" so that each value sits right after the fragment ending in
  # its label.
  a <- html_text(x, trim = TRUE) %>%
    tolower() %>%
    gsub(pattern = "\n", replacement = " ", x = ., fixed = TRUE) %>%
    gsub(x = ., pattern = "(?<=[\\s])\\s*|^\\s+|\\s+$", replacement = "",
         perl = TRUE) %>%
    strsplit(x = ., split = ":") %>%
    unlist()
  # unlist() gives NULL when the page has no .whois_info node; coercing to a
  # character vector makes every lookup below yield NA instead of a
  # zero-length value.
  a <- as.character(a)
  a <- a[a != ""]

  inetnum <- valorTrasEtiqueta(a, "inetnum") %>%
    gsub(x = ., pattern = PalabraClave, replacement = "") %>%
    toupper() %>%
    trimws()

  descr <- valorTrasEtiqueta(a, "descr") %>%
    gsub(x = ., pattern = PalabraClave, replacement = "") %>%
    toupper() %>%
    trimws()

  Pais <- valorTrasEtiqueta(a, "country") %>%
    funcionLimpia() %>%
    gsub(x = ., pattern = PalabraClave, replacement = "") %>%
    toupper()

  # NOTE(review): "tor" is matched as a plain substring, so fragments that
  # contain words such as "operator" are reported too. Left unchanged so the
  # results stay the same; confirm whether a stricter pattern is wanted.
  data.table(
    ip        = ip,
    inetnum   = inetnum,
    Pais      = Pais,
    descr     = descr,
    Tor       = fragmentosCon(a, "tor"),
    amazon    = fragmentosCon(a, "amazon"),
    microsoft = fragmentosCon(a, "microsoft")
  )
}

# Rows are collected in a preallocated list and bound once after the loop
# instead of growing `tablon` with rbind() on every iteration.
filas <- vector("list", length(listaIP))

for (k in seq_along(listaIP)) {
  j <- listaIP[[k]]

  # A failure on one IP is reported and stored as an all-NA row, so the rows
  # already collected are not lost and the remaining IPs are still tried.
  fila <- tryCatch(
    consultaIP(j),
    error = function(e) {
      warning("No se pudo consultar ", j, ": ", conditionMessage(e),
              call. = FALSE, immediate. = TRUE)
      data.table(
        ip        = j,
        inetnum   = NA_character_,
        Pais      = NA_character_,
        descr     = NA_character_,
        Tor       = NA_character_,
        amazon    = NA_character_,
        microsoft = NA_character_
      )
    }
  )
  filas[[k]] <- fila

  # Short pause when no country was found, otherwise a random 5-15 second
  # pause.
  espera <- if (is.na(fila[["Pais"]])) 1 else sample(x = 5:15, size = 1)
  cat("\nEsperamos ", espera, "\n")
  # Delay in seconds so the site does not flag the requests; the list of IPs
  # is short, so the total wait stays small.
  Sys.sleep(time = espera)
  cat("\n", j, "terminado\n")
}

# Append all collected rows below whatever `tablon` already holds.
tablon <- rbindlist(c(list(tablon), filas))

# Report the total run time in minutes, rounded to two decimals, between two
# separator lines.
cat("\n==============================================\n")
cat(
  "El proceso termino en ",
  round(as.numeric(difftime(time1 = Sys.time(), time2 = t0, units = "m")), 2),
  " minutos\n"
)
cat("\n==============================================\n")

# Keep only the rows that have an IP; this removes the all-NA seed row.
tablon <- tablon[!is.na(ip), ]

No hay comentarios:

Publicar un comentario