Nescanos <- 57
PorMin <- 0.05
df <- data.frame( Carmena = .284, PP = .219, Ciudantans = .218, PSOE = .137, Franco = .082, IU = 0.023, Total = 1642898)
df <- df[ , df >= PorMin ]
df <- df[ , order(df, decreasing = TRUE)]
df <- df$Total * df
df$PP <- df$PP + 3105
# EscañoPara
# Carmena Ciudantans Franco PP PSOE
# 18 13 5 13 8
df$Total <- rm()
EscanosDF <- df
for( i in 2:Nescanos ){
EscanosDF <- rbind( EscanosDF , df/i)
}
EscañoPara <- c()
i <- 1 +1
for( i in 1:Nescanos){
cat("repartiendo escaño ", i, "\n")
print( sapply(X = EscanosDF, FUN = max, na.rm = TRUE) )
print( sapply(X = EscanosDF, FUN = function(x){ max( length( x [ is.na(x) == TRUE] ) ) + 1 }) )
Maximo <- max( EscanosDF , na.rm = TRUE)[1]
Fila <- which( apply(X = EscanosDF, MARGIN = 1, FUN = function(x){ any(x == Maximo)}) )
Fila <- Fila[ length(Fila)]
Columna <- which.max( EscanosDF[Fila, ] )[1]
nombrePartido <- names(Columna)
EscanosDF[ Fila, Columna] <- NA
EscañoPara <- c(EscañoPara, nombrePartido )
cat("Escaño Para", nombrePartido, "\n")
}
table( EscañoPara)
EscañoPara
Neurociencia y Matemáticas
Mi nombre es Javier Villacampa soy cientifico de datos, matemático y un apasionado en la neurociencia cognitiva. Os doy la bienvenida a mi blog donde exploraremos distintos usos de las matemáticas (Usaremos R como herramienta estándar). Además también hablaremos de algunos artículos cientificos interesantes sobre neurociencia. Espero que disfruteis.
miércoles, 22 de enero de 2020
Paseando por shiny dygraph y R Markdown
Dy grpah es una de las librearias más diveridas vamos a ver como generar grupo de gráficos, tablas y grafico interactivos con markdown y shiny
---
title: "Pruebas Shiny"
author: "jvigo6n"
date: "21 de enero de 2020"
output: html_document
# ioslides_presentation:
# keep_md: yes
# widescreen: yes
runtime: shiny
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
```
# La bolse y R (Paquete quantMode) {.tabset }
Vamos a hacer un mardown para tradear, para ver los nombres de las compañias aquí [YahooFinanciero](https://finance.yahoo.com/quote/TEF?p=TEF&.tsrc=fin-srch ).
## Genera Grupos de dygraph {.tabset }
```{r dygraphs sincro , echo=FALSE, error=FALSE, warning= FALSE, message=FALSE}
# https://finance.yahoo.com/quote/TEF?p=TEF&.tsrc=fin-srch
library(dplyr)
library(dygraphs)
library(DT)
library(quantmod)
tickers <- c("AAPL", "MSFT", "IBM", "TEF")
getSymbols(tickers)
closePrices <- do.call(merge, lapply(tickers, function(x) Cl(get(x))))
dateWindow <- c("2008-01-01", "2009-01-01")
dygraph(closePrices, main = "Value", group = "stock") %>%
dyRebase(value = 100) %>%
dyRangeSelector(dateWindow = dateWindow)
dygraph(closePrices, main = "Percent", group = "stock") %>%
dyRebase(percent = TRUE) %>%
dyRangeSelector(dateWindow = dateWindow)
dygraph(closePrices, main = "None", group = "stock2") %>%
dyRangeSelector(dateWindow = dateWindow)
```
## Interactive Table {.tabset }
```{r dygraphs}
# https://shiny.rstudio.com/gallery/basic-datatable.html
closePrices2 <- closePrices %>% data.frame()
closePrices2$Fecha <- index(closePrices)
closePrices2 <-
closePrices2[ , c("Fecha", names(closePrices2) [ names(closePrices2) != "Fecha"] )]
row.names(closePrices2) <- rm()
# closePrices2
shinyApp(
ui = fluidPage(
selectInput("compania", "Companyia:",
c("APPLE" = "AAPL.Close",
"Microsft" = "MSFT.Close",
"IBM" = "IBM.Close",
"Telefonica" = "TEF.Close"), state.name, multiple= T, selectize=T)
, tableOutput("data")
, dataTableOutput("DT")
),
server = function(input, output) {
output$data <- renderTable({
if( length(input$compania)> 0 ){
x <- closePrices2[ , c("Fecha" ,names(closePrices2)[ names(closePrices2) %in%
input$compania
# "TEF.Close"
] ) ]
x <- x[order(x[,2], decreasing = TRUE),][1:10,]
x$Fecha <- x$Fecha %>% as.Date %>% as.character()
x
}else{
closePrices2[ !1:nrow(closePrices2),]
}
}, rownames = FALSE)
output$DT <- renderDataTable({
if( length(input$compania)> 0 ){
x <- closePrices2[ , names(closePrices2) %in% c("Fecha", input$compania)]
x <- x[order(x[,1], decreasing = TRUE),]
x
}else{
x <- closePrices2
x <- x[ !1:nrow(closePrices2),]
x
}})
})
```
## Shiny {.tabset }
```{r Shiny}
closePrices3 <- closePrices
# closePrices2
plotdyg <- function(dataPlot){
dygraph(dataPlot, main = "None", group = "stock") %>%
dyRangeSelector(dateWindow = dateWindow)
}
shinyApp(
ui = fluidPage(
selectInput("compania", "Companyia:",
c("APPLE" = "AAPL.Close",
"Microsft" = "MSFT.Close",
"IBM" = "IBM.Close",
"Telefonica" = "TEF.Close"), state.name, multiple= T, selectize=T)
, dygraphOutput("dyg")
, dygraphOutput("dyg2")
# , dataTableOutput("DT")
),
server = function(input, output) {
output$dyg <- renderDygraph({
if( length(input$compania)> 0 ){
x <- closePrices3[ , names(closePrices3) %in% input$compania ]
plotdyg( x )
}else{
plotdyg( closePrices3[ !1:nrow(closePrices2),] )
}
})
output$dyg2 <- renderDygraph({plotdyg( closePrices3 )})
})
```
## Interactive Plot {.tabset }
```{r dygraphs2}
# https://shiny.rstudio.com/gallery/basic-datatable.html
closePrices4 <- closePrices
# closePrices2
plotdyg <- function(dataPlot){
dygraph(dataPlot, main = "None", group = "stock") %>%
dyRangeSelector(dateWindow = dateWindow)
}
selectInput(inputId = "compania", label = "Companyia:",
choices = c("APPLE" = "AAPL.Close",
"Microsft" = "MSFT.Close",
"IBM" = "IBM.Close",
"Telefonica" = "TEF.Close"), state.name, multiple= T, selectize=T)
renderDygraph({
if( length(input$compania)> 0 ){
x <- closePrices3[ , names(closePrices3) %in% input$compania ]
plotdyg( x )
}else{
plotdyg( closePrices3[ !1:nrow(closePrices2),] )
}
})
renderDygraph({plotdyg( closePrices3 )})
sliderInput("bins", "Number of bins:", min = 1, max = 50, value = 30)
renderPlot({
x <- faithful[, 2] # Old Faithful Geyser data
bins <- seq(min(x), max(x), length.out = input$bins + 1)
# draw the histogram with the specified number of bins
hist(x, breaks = bins, col = 'darkgray', border = 'white')
})
```
---
title: "Pruebas Shiny"
author: "jvigo6n"
date: "21 de enero de 2020"
output: html_document
# ioslides_presentation:
# keep_md: yes
# widescreen: yes
runtime: shiny
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
```
# La bolse y R (Paquete quantMode) {.tabset }
Vamos a hacer un mardown para tradear, para ver los nombres de las compañias aquí [YahooFinanciero](https://finance.yahoo.com/quote/TEF?p=TEF&.tsrc=fin-srch ).
## Genera Grupos de dygraph {.tabset }
```{r dygraphs sincro , echo=FALSE, error=FALSE, warning= FALSE, message=FALSE}
# https://finance.yahoo.com/quote/TEF?p=TEF&.tsrc=fin-srch
library(dplyr)
library(dygraphs)
library(DT)
library(quantmod)
tickers <- c("AAPL", "MSFT", "IBM", "TEF")
getSymbols(tickers)
closePrices <- do.call(merge, lapply(tickers, function(x) Cl(get(x))))
dateWindow <- c("2008-01-01", "2009-01-01")
dygraph(closePrices, main = "Value", group = "stock") %>%
dyRebase(value = 100) %>%
dyRangeSelector(dateWindow = dateWindow)
dygraph(closePrices, main = "Percent", group = "stock") %>%
dyRebase(percent = TRUE) %>%
dyRangeSelector(dateWindow = dateWindow)
dygraph(closePrices, main = "None", group = "stock2") %>%
dyRangeSelector(dateWindow = dateWindow)
```
## Interactive Table {.tabset }
```{r dygraphs}
# https://shiny.rstudio.com/gallery/basic-datatable.html
closePrices2 <- closePrices %>% data.frame()
closePrices2$Fecha <- index(closePrices)
closePrices2 <-
closePrices2[ , c("Fecha", names(closePrices2) [ names(closePrices2) != "Fecha"] )]
row.names(closePrices2) <- rm()
# closePrices2
shinyApp(
ui = fluidPage(
selectInput("compania", "Companyia:",
c("APPLE" = "AAPL.Close",
"Microsft" = "MSFT.Close",
"IBM" = "IBM.Close",
"Telefonica" = "TEF.Close"), state.name, multiple= T, selectize=T)
, tableOutput("data")
, dataTableOutput("DT")
),
server = function(input, output) {
output$data <- renderTable({
if( length(input$compania)> 0 ){
x <- closePrices2[ , c("Fecha" ,names(closePrices2)[ names(closePrices2) %in%
input$compania
# "TEF.Close"
] ) ]
x <- x[order(x[,2], decreasing = TRUE),][1:10,]
x$Fecha <- x$Fecha %>% as.Date %>% as.character()
x
}else{
closePrices2[ !1:nrow(closePrices2),]
}
}, rownames = FALSE)
output$DT <- renderDataTable({
if( length(input$compania)> 0 ){
x <- closePrices2[ , names(closePrices2) %in% c("Fecha", input$compania)]
x <- x[order(x[,1], decreasing = TRUE),]
x
}else{
x <- closePrices2
x <- x[ !1:nrow(closePrices2),]
x
}})
})
```
## Shiny {.tabset }
```{r Shiny}
closePrices3 <- closePrices
# closePrices2
plotdyg <- function(dataPlot){
dygraph(dataPlot, main = "None", group = "stock") %>%
dyRangeSelector(dateWindow = dateWindow)
}
shinyApp(
ui = fluidPage(
selectInput("compania", "Companyia:",
c("APPLE" = "AAPL.Close",
"Microsft" = "MSFT.Close",
"IBM" = "IBM.Close",
"Telefonica" = "TEF.Close"), state.name, multiple= T, selectize=T)
, dygraphOutput("dyg")
, dygraphOutput("dyg2")
# , dataTableOutput("DT")
),
server = function(input, output) {
output$dyg <- renderDygraph({
if( length(input$compania)> 0 ){
x <- closePrices3[ , names(closePrices3) %in% input$compania ]
plotdyg( x )
}else{
plotdyg( closePrices3[ !1:nrow(closePrices2),] )
}
})
output$dyg2 <- renderDygraph({plotdyg( closePrices3 )})
})
```
## Interactive Plot {.tabset }
```{r dygraphs2}
# https://shiny.rstudio.com/gallery/basic-datatable.html
closePrices4 <- closePrices
# closePrices2
plotdyg <- function(dataPlot){
dygraph(dataPlot, main = "None", group = "stock") %>%
dyRangeSelector(dateWindow = dateWindow)
}
selectInput(inputId = "compania", label = "Companyia:",
choices = c("APPLE" = "AAPL.Close",
"Microsft" = "MSFT.Close",
"IBM" = "IBM.Close",
"Telefonica" = "TEF.Close"), state.name, multiple= T, selectize=T)
renderDygraph({
if( length(input$compania)> 0 ){
x <- closePrices3[ , names(closePrices3) %in% input$compania ]
plotdyg( x )
}else{
plotdyg( closePrices3[ !1:nrow(closePrices2),] )
}
})
renderDygraph({plotdyg( closePrices3 )})
sliderInput("bins", "Number of bins:", min = 1, max = 50, value = 30)
renderPlot({
x <- faithful[, 2] # Old Faithful Geyser data
bins <- seq(min(x), max(x), length.out = input$bins + 1)
# draw the histogram with the specified number of bins
hist(x, breaks = bins, col = 'darkgray', border = 'white')
})
```
lunes, 29 de octubre de 2018
Gini para continuas volumen 2
GiniFunc <-
function(a, p){
df <- data.frame(a = a, p = p)
df <- df[ order(a, decreasing = FALSE),]
summary(df)
df2 <- data.frame(a = df$a , p = cumsum( df$p ))
df3 <- df2
df3$real <- df2$a/max(df2$a)
df3$prediccion <- df2$p/max(df2$p)
df3$a <- rm()
df3$p <- rm()
df3 <- rbind( data.frame(real =0, prediccion =0), df3)
summary(df3)
plot(x = df3$real, y = df3$prediccion , type = "l", col = "red")
abline(a = 0, b = 1 )
df3 <- df3[ !duplicated(df3) ,]
# print(df3)
# se resta la x y se suma la y. Es un trapezoide con base x[i+1] - x[i] y altura
giniDF <- data.frame( realEjeX = df3$real - Hmisc::Lag(df3$real, shift = 1)
, prediccionlEjeY_i = df3$prediccion
, prediccionlEjeY_iMenos1 =Hmisc::Lag(df3$prediccion, shift = 1)
)
giniDF$prediccionlEjeY_min <- apply(X = giniDF[, c("prediccionlEjeY_i", "prediccionlEjeY_iMenos1")], MARGIN = 1, FUN = min, na.rm = TRUE)
giniDF$prediccionlEjeY_max <- apply(X = giniDF[, c("prediccionlEjeY_i", "prediccionlEjeY_iMenos1")], MARGIN = 1, FUN = max, na.rm = TRUE)
giniDF$prediccionlEjeY_alturaRectangulo <- giniDF$prediccionlEjeY_min
giniDF$prediccionlEjeY_alturaTriangulo <- giniDF$prediccionlEjeY_max - giniDF$prediccionlEjeY_min
x <- giniDF$realEjeX * giniDF$prediccionlEjeY_alturaRectangulo + # Area rectangulo
(giniDF$realEjeX * giniDF$prediccionlEjeY_alturaTriangulo)/2 # Area triangulo
AUC <- sum(x, na.rm = TRUE) # Sumo todas las areas
return( list( AUC = AUC, Gini = abs(2 *AUC - 1), BaseDatosOrigianl = df, BasesDatosProporciones = df3) )
}
# 10 personas con riqueza parecida
z <- GiniFunc(a = 1:10, p = 100 + 1:10*10 )
z$Gini
z$AUC
# Gini 0
# 10 personas con la misma riqueza y una con mucha
z <- GiniFunc(a = rep(1,10), p = rep(10, 1e4) )
z$Gini
z$AUC
# Gini 0.9
# Relación entre uniformes
z <- GiniFunc(a = runif(1e3), p = runif(1e3) )
z$Gini
z$AUC
# Gini 0
# 10 personas con riqueza creciente o dos variables con el mismo valor
z <- GiniFunc(a = 1:10, p = 1:10)
z$Gini
z$AUC
# Una persona con toda la riqueza
N <- 100000
GiniFunc(a = rep(1, N), p = c(rep(0, N-1), 10))$AUC
GiniFunc(a = c(rep(0, N-1), 10), p = rep(1, N))$Gini
# Gini 0.99
# El caso de las dos uniformes
set.seed(123456)
x <-runif(1e3)
set.seed(123457)
y <-runif(1e3)
z <- GiniFunc(a =x , p = y)
z$Gini
z$AUC
# Gini 0
# Para modelos que es lo que importa.Aquí funciona como debería. Se compara la variable a predecir con la predicción
set.seed(123456)
x <-runif(1e3)
y <- 2* x + 3 + rnorm(1e3, mean = 0, sd = 0.01)
modelo <- lm( formula = y ~ x)
summary(modelo)
y_predict <- predict(object = modelo, newdata = data.frame(x))
z <- GiniFunc(a =y , p = y_predict)
z$Gini
z$AUC
cor(y, y_predict)
plot(y, y_predict)
abline(a = 0, b = 1, col = "red", lwd = 4)
# Gini 0.63
set.seed(123456)
x <- mtcars$disp
y <- mtcars$mpg
modelo <- lm( formula = y ~ x)
summary(modelo)
y_predict <- predict(object = modelo, newdata = data.frame(x))
z <- GiniFunc(a =y , p = y_predict)
abline( v = 0.65, col = "blue", lwd = 3 )
z$Gini
z$AUC
cor(y, y_predict)
plot(y, y_predict)
abline(a = 0, b = 1, col = "red", lwd = 4)
x <- y[ order( y )]
x <- cumsum(x)
x <- x/ max(x)
y[order(y)][ which( x > 0.65 )[1] ]
abline( v = y[order(y)][ which( x > 0.65 )[1] ], col = "blue", lwd = 3)
title("Los dos dejan de predecir en el mismo sitio")
# Gini 0.23 porque falla al final
function(a, p){
df <- data.frame(a = a, p = p)
df <- df[ order(a, decreasing = FALSE),]
summary(df)
df2 <- data.frame(a = df$a , p = cumsum( df$p ))
df3 <- df2
df3$real <- df2$a/max(df2$a)
df3$prediccion <- df2$p/max(df2$p)
df3$a <- rm()
df3$p <- rm()
df3 <- rbind( data.frame(real =0, prediccion =0), df3)
summary(df3)
plot(x = df3$real, y = df3$prediccion , type = "l", col = "red")
abline(a = 0, b = 1 )
df3 <- df3[ !duplicated(df3) ,]
# print(df3)
# se resta la x y se suma la y. Es un trapezoide con base x[i+1] - x[i] y altura
giniDF <- data.frame( realEjeX = df3$real - Hmisc::Lag(df3$real, shift = 1)
, prediccionlEjeY_i = df3$prediccion
, prediccionlEjeY_iMenos1 =Hmisc::Lag(df3$prediccion, shift = 1)
)
giniDF$prediccionlEjeY_min <- apply(X = giniDF[, c("prediccionlEjeY_i", "prediccionlEjeY_iMenos1")], MARGIN = 1, FUN = min, na.rm = TRUE)
giniDF$prediccionlEjeY_max <- apply(X = giniDF[, c("prediccionlEjeY_i", "prediccionlEjeY_iMenos1")], MARGIN = 1, FUN = max, na.rm = TRUE)
giniDF$prediccionlEjeY_alturaRectangulo <- giniDF$prediccionlEjeY_min
giniDF$prediccionlEjeY_alturaTriangulo <- giniDF$prediccionlEjeY_max - giniDF$prediccionlEjeY_min
x <- giniDF$realEjeX * giniDF$prediccionlEjeY_alturaRectangulo + # Area rectangulo
(giniDF$realEjeX * giniDF$prediccionlEjeY_alturaTriangulo)/2 # Area triangulo
AUC <- sum(x, na.rm = TRUE) # Sumo todas las areas
return( list( AUC = AUC, Gini = abs(2 *AUC - 1), BaseDatosOrigianl = df, BasesDatosProporciones = df3) )
}
# 10 personas con riqueza parecida
z <- GiniFunc(a = 1:10, p = 100 + 1:10*10 )
z$Gini
z$AUC
# Gini 0
# 10 personas con la misma riqueza y una con mucha
z <- GiniFunc(a = rep(1,10), p = rep(10, 1e4) )
z$Gini
z$AUC
# Gini 0.9
# Relación entre uniformes
z <- GiniFunc(a = runif(1e3), p = runif(1e3) )
z$Gini
z$AUC
# Gini 0
# 10 personas con riqueza creciente o dos variables con el mismo valor
z <- GiniFunc(a = 1:10, p = 1:10)
z$Gini
z$AUC
# Una persona con toda la riqueza
N <- 100000
GiniFunc(a = rep(1, N), p = c(rep(0, N-1), 10))$AUC
GiniFunc(a = c(rep(0, N-1), 10), p = rep(1, N))$Gini
# Gini 0.99
# El caso de las dos uniformes
set.seed(123456)
x <-runif(1e3)
set.seed(123457)
y <-runif(1e3)
z <- GiniFunc(a =x , p = y)
z$Gini
z$AUC
# Gini 0
# Para modelos que es lo que importa.Aquí funciona como debería. Se compara la variable a predecir con la predicción
set.seed(123456)
x <-runif(1e3)
y <- 2* x + 3 + rnorm(1e3, mean = 0, sd = 0.01)
modelo <- lm( formula = y ~ x)
summary(modelo)
y_predict <- predict(object = modelo, newdata = data.frame(x))
z <- GiniFunc(a =y , p = y_predict)
z$Gini
z$AUC
cor(y, y_predict)
plot(y, y_predict)
abline(a = 0, b = 1, col = "red", lwd = 4)
# Gini 0.63
set.seed(123456)
x <- mtcars$disp
y <- mtcars$mpg
modelo <- lm( formula = y ~ x)
summary(modelo)
y_predict <- predict(object = modelo, newdata = data.frame(x))
z <- GiniFunc(a =y , p = y_predict)
abline( v = 0.65, col = "blue", lwd = 3 )
z$Gini
z$AUC
cor(y, y_predict)
plot(y, y_predict)
abline(a = 0, b = 1, col = "red", lwd = 4)
x <- y[ order( y )]
x <- cumsum(x)
x <- x/ max(x)
y[order(y)][ which( x > 0.65 )[1] ]
abline( v = y[order(y)][ which( x > 0.65 )[1] ], col = "blue", lwd = 3)
title("Los dos dejan de predecir en el mismo sitio")
# Gini 0.23 porque falla al final
jueves, 25 de octubre de 2018
CV para xgboost
ROC <- function( ModelProb , test, SAVE = F, LABEL = "", DIR= "") {
require ('ROCR')
require ('rpart')
if(SAVE){
if(DIR != "" ){
if(substr(x = DIR, start = nchar(DIR), stop = nchar(DIR)) != "/" ){
DIR <- paste0(DIR,"/")
}
}
jpeg(filename = paste0(DIR ,LABEL,"ROC.jpeg") , width = 20, height = 20, units = "cm", res = 250, bg = "white" )
par(las = 1)
}
pred <- prediction (ModelProb, test [,ncol(test)])
perf <- performance (pred, 'tpr', 'fpr')
gini <- (unlist (performance (pred, measure = 'auc')@y.values) - .5) / .5
ks <- max (unlist (perf@y.values) - unlist (perf@x.values))
plot (x= 100 * unlist (perf@x.values), y = 100 * unlist (perf@y.values), type= 'l', lwd = 3
, col =2, xaxs = 'i', yaxs = 'i', yaxt = 'n', xlab = '% falsos positivos', ylab = '% verdaderos positivos')
abline (a=0, b=1, lty = 5)
legend ('bottomright', 'hi', paste0 ('Gini = ', round (100*gini, 1), '%\n', 'Ks = ', round (100*ks,1), '%\n\n'))
if(SAVE){
dev.off()
}
return( list(gini = gini , ks = ks, aucDf = data.frame( real = 100 * unlist (perf@x.values), y = 100 * unlist (perf@y.values))))
}
FuncionCV <-
function( dtrain_dt, labelVector
, PosEta = seq( from = 0.15, to = 0.35, by = 0.1)
, PosDep = seq( from = 1, to = 5, by = 2)
, PosLoops = seq( from = 200, to = 300, by = 50)
, objective = "binary:logistic"
, nfold = 10, missing = "NAN", eval_metric = "auc"){
ConfiDep <- c()
ConfiLoops <- c()
ConfiEta <- c()
AUCres <- c()
j <- 1
for(loops in PosLoops){
cat( "\n=====================\n", loops, "\n=====================\n" )
for(maxD in PosDep){
for(Ceta in PosEta){
model_check <- xgb.cv (data = dtrain,
label = label,
nrounds = loops,
eta = Ceta,
max_depth = maxD,
nfold = nfold, # Con 10 es que coge uno a 10.
objective = objective,
eval_metric = eval_metric,
pred = TRUE,
missing = missing)
val <- glmnet::auc( y = label
, prob = model_check$pred)
ROC(ModelProb = model_check$pred, test = data.frame(label), SAVE = FALSE )
title( paste0( "eta = ", Ceta, ", max_depth =", maxD,", nrounds = ",loops ) )
AUCres[j] <- val
ConfiDep[j] <- maxD
ConfiEta[j] <- Ceta
ConfiLoops[j] <- loops
j <- j + 1
}
}
}
Resumen <-
data.frame( AUCres = AUCres
, GiniRes = AUCres *2 -1
, ConfiDep = ConfiDep
, ConfiEta = ConfiEta
, ConfiLoops = ConfiLoops)
return( Resumen)
}
require ('ROCR')
require ('rpart')
if(SAVE){
if(DIR != "" ){
if(substr(x = DIR, start = nchar(DIR), stop = nchar(DIR)) != "/" ){
DIR <- paste0(DIR,"/")
}
}
jpeg(filename = paste0(DIR ,LABEL,"ROC.jpeg") , width = 20, height = 20, units = "cm", res = 250, bg = "white" )
par(las = 1)
}
pred <- prediction (ModelProb, test [,ncol(test)])
perf <- performance (pred, 'tpr', 'fpr')
gini <- (unlist (performance (pred, measure = 'auc')@y.values) - .5) / .5
ks <- max (unlist (perf@y.values) - unlist (perf@x.values))
plot (x= 100 * unlist (perf@x.values), y = 100 * unlist (perf@y.values), type= 'l', lwd = 3
, col =2, xaxs = 'i', yaxs = 'i', yaxt = 'n', xlab = '% falsos positivos', ylab = '% verdaderos positivos')
abline (a=0, b=1, lty = 5)
legend ('bottomright', 'hi', paste0 ('Gini = ', round (100*gini, 1), '%\n', 'Ks = ', round (100*ks,1), '%\n\n'))
if(SAVE){
dev.off()
}
return( list(gini = gini , ks = ks, aucDf = data.frame( real = 100 * unlist (perf@x.values), y = 100 * unlist (perf@y.values))))
}
FuncionCV <-
function( dtrain_dt, labelVector
, PosEta = seq( from = 0.15, to = 0.35, by = 0.1)
, PosDep = seq( from = 1, to = 5, by = 2)
, PosLoops = seq( from = 200, to = 300, by = 50)
, objective = "binary:logistic"
, nfold = 10, missing = "NAN", eval_metric = "auc"){
ConfiDep <- c()
ConfiLoops <- c()
ConfiEta <- c()
AUCres <- c()
j <- 1
for(loops in PosLoops){
cat( "\n=====================\n", loops, "\n=====================\n" )
for(maxD in PosDep){
for(Ceta in PosEta){
model_check <- xgb.cv (data = dtrain,
label = label,
nrounds = loops,
eta = Ceta,
max_depth = maxD,
nfold = nfold, # Con 10 es que coge uno a 10.
objective = objective,
eval_metric = eval_metric,
pred = TRUE,
missing = missing)
val <- glmnet::auc( y = label
, prob = model_check$pred)
ROC(ModelProb = model_check$pred, test = data.frame(label), SAVE = FALSE )
title( paste0( "eta = ", Ceta, ", max_depth =", maxD,", nrounds = ",loops ) )
AUCres[j] <- val
ConfiDep[j] <- maxD
ConfiEta[j] <- Ceta
ConfiLoops[j] <- loops
j <- j + 1
}
}
}
Resumen <-
data.frame( AUCres = AUCres
, GiniRes = AUCres *2 -1
, ConfiDep = ConfiDep
, ConfiEta = ConfiEta
, ConfiLoops = ConfiLoops)
return( Resumen)
}
lunes, 22 de octubre de 2018
Gini para continuas
El gini para continuas es un poco puñetero. Se puede generar un Gini de riqueza para simular el Gini, el problema es que las variables tienen que estar en la misma proporción.
GiniFunc <-
function(a, p){
df <- data.frame(a = a, p = p)
df <- df[ order(a),]
summary(df)
df2 <- data.frame(a = cumsum( df$a ), p = cumsum( df$p ))
cumsum( df$a )[1:10]
cumsum( df$p )[1:10]
plot( cumsum( df$a )[1:100],
cumsum( df$p )[1:100], type = "l")
abline(a = 0, b = 1)
df3 <- df2
df3$real <- df2$a/max(df2$a)
df3$prediccion <- df2$p/max(df2$p)
df3$a <- rm()
df3$p <- rm()
df3 <- rbind(data.frame(real =0, prediccion =0), df3)
df3 <- rbind(df3, data.frame(real =1, prediccion =1))
summary(df3)
plot(x = df3$real, y = df3$prediccion , type = "l", col = "red")
abline(a = 0, b = 1 )
df3 <- df3[ !duplicated(df3) ,]
# print(df3)
# se resta la x y se suma la y. Es un trapezoide con base x[i+1] - x[i] y altura
giniDF <- data.frame( realEjeX = df3$real - Hmisc::Lag(df3$real, shift = 1)
, prediccionlEjeY_i = df3$prediccion
, prediccionlEjeY_iMenos1 =Hmisc::Lag(df3$prediccion, shift = 1)
)
giniDF$prediccionlEjeY_min <- apply(X = giniDF[, c("prediccionlEjeY_i", "prediccionlEjeY_iMenos1")], MARGIN = 1, FUN = min, na.rm = TRUE)
giniDF$prediccionlEjeY_max <- apply(X = giniDF[, c("prediccionlEjeY_i", "prediccionlEjeY_iMenos1")], MARGIN = 1, FUN = max, na.rm = TRUE)
giniDF$prediccionlEjeY_alturaRectangulo <- giniDF$prediccionlEjeY_min
giniDF$prediccionlEjeY_alturaTriangulo <- giniDF$prediccionlEjeY_max - giniDF$prediccionlEjeY_min
x <- giniDF$realEjeX * giniDF$prediccionlEjeY_alturaRectangulo + # Area rectangulo
(giniDF$realEjeX * giniDF$prediccionlEjeY_alturaTriangulo)/2 # Area triangulo
AUC <- sum(x, na.rm = TRUE) # Sumo todas las areas
return( list( GiniCorrelacion = 1-(2*AUC-1), AUC = AUC, Gini = 2 *AUC - 1, BaseDatosOrigianl = df, BasesDatosProporciones = df3) )
}
# 10 personas con riqueza parecida
GiniFunc(a = 1:10, p = 100 + 1:10*10 )$GiniCorrelacion
# 10 personas con la misma riqueza
GiniFunc(a = rep(1,10), p = rep(10, 1e4) )$GiniCorrelacion
# 0
GiniFunc(a = runif(1e3), p = runif(1e3) )$GiniCorrelacion
# 10 personas con riqueza creciente o dos variables con el mismo valor
GiniFunc(a = 1:10, p = 1:10)
# 0
# Una persona con toda la riqueza
N <- 100000
GiniFunc(a = rep(1, N), p = c(rep(0, N-1), 10))$AUC
GiniFunc(a = c(rep(0, N-1), 10), p = rep(1, N))$Gini
# 1
# Este es el caso más raro. Con dos unifermos
set.seed(123456)
x <-runif(1e3)
set.seed(123457)
y <-runif(1e3)
z <- GiniFunc(a =x , p = y)
z$GiniCorrelacion
z$Gini
z$AUC
# Para modelos que es lo que importa.Aquí funciona como debería. Se compara la variable a predecir con la predicción
set.seed(123456)
x <-runif(1e3)
y <- 2* x + 3 + rnorm(1e3, mean = 0, sd = 0.01)
modelo <- lm( formula = y ~ x)
summary(modelo)
y_predict <- predict(object = modelo, newdata = data.frame(x))
z <- GiniFunc(a =y , p = y_predict)
z$GiniCorrelacion
z$Gini
z$AUC
cor(y, y_predict)
plot(y, y_predict)
abline(a = 0, b = 1, col = "red", lwd = 4)
set.seed(123456)
x <- mtcars$disp
y <- mtcars$mpg
modelo <- lm( formula = y ~ x)
summary(modelo)
y_predict <- predict(object = modelo, newdata = data.frame(x))
z <- GiniFunc(a =y , p = y_predict)
z$GiniCorrelacion
z$Gini
z$AUC
cor(y, y_predict)
plot(y, y_predict)
abline(a = 0, b = 1, col = "red", lwd = 4)
GiniFunc <-
function(a, p){
df <- data.frame(a = a, p = p)
df <- df[ order(a),]
summary(df)
df2 <- data.frame(a = cumsum( df$a ), p = cumsum( df$p ))
cumsum( df$a )[1:10]
cumsum( df$p )[1:10]
plot( cumsum( df$a )[1:100],
cumsum( df$p )[1:100], type = "l")
abline(a = 0, b = 1)
df3 <- df2
df3$real <- df2$a/max(df2$a)
df3$prediccion <- df2$p/max(df2$p)
df3$a <- rm()
df3$p <- rm()
df3 <- rbind(data.frame(real =0, prediccion =0), df3)
df3 <- rbind(df3, data.frame(real =1, prediccion =1))
summary(df3)
plot(x = df3$real, y = df3$prediccion , type = "l", col = "red")
abline(a = 0, b = 1 )
df3 <- df3[ !duplicated(df3) ,]
# print(df3)
# se resta la x y se suma la y. Es un trapezoide con base x[i+1] - x[i] y altura
giniDF <- data.frame( realEjeX = df3$real - Hmisc::Lag(df3$real, shift = 1)
, prediccionlEjeY_i = df3$prediccion
, prediccionlEjeY_iMenos1 =Hmisc::Lag(df3$prediccion, shift = 1)
)
giniDF$prediccionlEjeY_min <- apply(X = giniDF[, c("prediccionlEjeY_i", "prediccionlEjeY_iMenos1")], MARGIN = 1, FUN = min, na.rm = TRUE)
giniDF$prediccionlEjeY_max <- apply(X = giniDF[, c("prediccionlEjeY_i", "prediccionlEjeY_iMenos1")], MARGIN = 1, FUN = max, na.rm = TRUE)
giniDF$prediccionlEjeY_alturaRectangulo <- giniDF$prediccionlEjeY_min
giniDF$prediccionlEjeY_alturaTriangulo <- giniDF$prediccionlEjeY_max - giniDF$prediccionlEjeY_min
x <- giniDF$realEjeX * giniDF$prediccionlEjeY_alturaRectangulo + # Area rectangulo
(giniDF$realEjeX * giniDF$prediccionlEjeY_alturaTriangulo)/2 # Area triangulo
AUC <- sum(x, na.rm = TRUE) # Sumo todas las areas
return( list( GiniCorrelacion = 1-(2*AUC-1), AUC = AUC, Gini = 2 *AUC - 1, BaseDatosOrigianl = df, BasesDatosProporciones = df3) )
}
# 10 personas con riqueza parecida
GiniFunc(a = 1:10, p = 100 + 1:10*10 )$GiniCorrelacion
# 10 personas con la misma riqueza
GiniFunc(a = rep(1,10), p = rep(10, 1e4) )$GiniCorrelacion
# 0
GiniFunc(a = runif(1e3), p = runif(1e3) )$GiniCorrelacion
# 10 personas con riqueza creciente o dos variables con el mismo valor
GiniFunc(a = 1:10, p = 1:10)
# 0
# Una persona con toda la riqueza
N <- 100000
GiniFunc(a = rep(1, N), p = c(rep(0, N-1), 10))$AUC
GiniFunc(a = c(rep(0, N-1), 10), p = rep(1, N))$Gini
# 1
# Este es el caso más raro. Con dos unifermos
set.seed(123456)
x <-runif(1e3)
set.seed(123457)
y <-runif(1e3)
z <- GiniFunc(a =x , p = y)
z$GiniCorrelacion
z$Gini
z$AUC
# Para modelos que es lo que importa.Aquí funciona como debería. Se compara la variable a predecir con la predicción
set.seed(123456)
x <-runif(1e3)
y <- 2* x + 3 + rnorm(1e3, mean = 0, sd = 0.01)
modelo <- lm( formula = y ~ x)
summary(modelo)
y_predict <- predict(object = modelo, newdata = data.frame(x))
z <- GiniFunc(a =y , p = y_predict)
z$GiniCorrelacion
z$Gini
z$AUC
cor(y, y_predict)
plot(y, y_predict)
abline(a = 0, b = 1, col = "red", lwd = 4)
set.seed(123456)
x <- mtcars$disp
y <- mtcars$mpg
modelo <- lm( formula = y ~ x)
summary(modelo)
y_predict <- predict(object = modelo, newdata = data.frame(x))
z <- GiniFunc(a =y , p = y_predict)
z$GiniCorrelacion
z$Gini
z$AUC
cor(y, y_predict)
plot(y, y_predict)
abline(a = 0, b = 1, col = "red", lwd = 4)
miércoles, 3 de octubre de 2018
Scrapenado Ips por pais en R
library(rvest)
library(curl)
library(dplyr)
library(data.table)
t0 <-Sys.time()
listaIP <- c(
"5.188.10.8"
, "5.62.56.55"
, "5.62.58.55"
, "5.9.158.75"
) %>% unique
tablon <- data.table(ip = as.character(NA), inetnum = as.character(NA), Pais = as.character(NA),descr = as.character(NA)
, Tor = as.character(NA), amazon = as.character(NA), microsoft = as.character(NA) )
for( j in listaIP){
url <- paste0('http://whois.chromefans.org/', j)
Pagina <- html(curl(url, handle = curl::new_handle("useragent" = "Mozilla/5.0")))
# [contains(concat( " ", @class, " " ), concat( " ", "whois_info", " " ))]
x <- html_nodes(Pagina,'.whois_info')
funcionLimpia <- function(txt){
txt %>% gsub(x = ., pattern = "(?<=[\\s])\\s*|^\\s+|\\s+$", replacement = "", perl=TRUE) %>%
gsub(pattern = "[^(a-z)]", replacement = " ", x = . ) %>% trimws
}
PalabrasClave <- html_nodes(x,'strong') %>% html_text() %>% tolower %>% gsub(x = ., pattern = "(?<=[\\s])\\s*|^\\s+|\\s+$", replacement = "", perl=TRUE) %>%
gsub(pattern = "[^(a-z)]", replacement = " ", x = . ) %>% trimws
PalabrasClave2 <- paste0( "(", PalabrasClave, ")")
PalabrasClave2 <- PalabrasClave
PalabraClave <- paste(PalabrasClave2, collapse = "|")
a <-
html_text(x, trim = TRUE) %>% tolower %>%
gsub(pattern = "\n", replacement = " ", x = . , fixed = TRUE) %>%
gsub(x = ., pattern = "(?<=[\\s])\\s*|^\\s+|\\s+$", replacement = "", perl=TRUE) %>%
strsplit(x = ., split = ":") %>% unlist()
a <- a[ a != ""]
i <- grep(pattern = "inetnum", x = (a) )[1]
inetnum <- a[i+1] %>% gsub(x = .,, pattern = PalabraClave, replacement = "") %>% toupper %>% trimws
i <- grep(pattern = "descr", x = (a), fixed = TRUE )[1]
descr <- a[i+1] %>% gsub(x = .,, pattern = PalabraClave, replacement = "") %>% toupper %>% trimws
i <- grep(pattern = "country", x = (a) )[1]
Pais <- a[i+1] %>% funcionLimpia %>% gsub(x = .,, pattern = PalabraClave, replacement = "") %>% toupper
i <- grep(pattern = "tor", x = (a) )
Tor <- paste0( a[i], collapse = " @CH@ " )
i <- grep(pattern = "amazon", x = (a) )
amazon <- paste0( a[i], collapse = " @CH@ " )
i <- grep(pattern = "microsoft", x = (a) )
microsoft <- paste0( a[i], collapse = " @CH@ " )
tablon <- rbind( tablon, data.table(ip = j, inetnum = inetnum, Pais = Pais, descr = descr, Tor = Tor, amazon = amazon, microsoft = microsoft))
espera <- ifelse( test = is.na(Pais) , yes = 1, no = sample(x = 5:15, size = 1) )
cat( "\nEsperamos ", espera,"\n")
Sys.sleep( time = espera) # segundos de delay para que la web no cante. Total no son muchos
cat( "\n", j,"terminado\n")
}
cat( "\n==============================================\n")
difftime( time1 = Sys.time(), time2 = t0, units = "m") %>% as.numeric %>% round(2) %>%
cat( "El proceso termino en ", ., " minutos\n")
cat( "\n==============================================\n")
tablon <- tablon[ !is.na(ip)]
library(curl)
library(dplyr)
library(data.table)
t0 <-Sys.time()
listaIP <- c(
"5.188.10.8"
, "5.62.56.55"
, "5.62.58.55"
, "5.9.158.75"
) %>% unique
tablon <- data.table(ip = as.character(NA), inetnum = as.character(NA), Pais = as.character(NA),descr = as.character(NA)
, Tor = as.character(NA), amazon = as.character(NA), microsoft = as.character(NA) )
for( j in listaIP){
url <- paste0('http://whois.chromefans.org/', j)
Pagina <- html(curl(url, handle = curl::new_handle("useragent" = "Mozilla/5.0")))
# [contains(concat( " ", @class, " " ), concat( " ", "whois_info", " " ))]
x <- html_nodes(Pagina,'.whois_info')
funcionLimpia <- function(txt){
txt %>% gsub(x = ., pattern = "(?<=[\\s])\\s*|^\\s+|\\s+$", replacement = "", perl=TRUE) %>%
gsub(pattern = "[^(a-z)]", replacement = " ", x = . ) %>% trimws
}
PalabrasClave <- html_nodes(x,'strong') %>% html_text() %>% tolower %>% gsub(x = ., pattern = "(?<=[\\s])\\s*|^\\s+|\\s+$", replacement = "", perl=TRUE) %>%
gsub(pattern = "[^(a-z)]", replacement = " ", x = . ) %>% trimws
PalabrasClave2 <- paste0( "(", PalabrasClave, ")")
PalabrasClave2 <- PalabrasClave
PalabraClave <- paste(PalabrasClave2, collapse = "|")
a <-
html_text(x, trim = TRUE) %>% tolower %>%
gsub(pattern = "\n", replacement = " ", x = . , fixed = TRUE) %>%
gsub(x = ., pattern = "(?<=[\\s])\\s*|^\\s+|\\s+$", replacement = "", perl=TRUE) %>%
strsplit(x = ., split = ":") %>% unlist()
a <- a[ a != ""]
i <- grep(pattern = "inetnum", x = (a) )[1]
inetnum <- a[i+1] %>% gsub(x = .,, pattern = PalabraClave, replacement = "") %>% toupper %>% trimws
i <- grep(pattern = "descr", x = (a), fixed = TRUE )[1]
descr <- a[i+1] %>% gsub(x = .,, pattern = PalabraClave, replacement = "") %>% toupper %>% trimws
i <- grep(pattern = "country", x = (a) )[1]
Pais <- a[i+1] %>% funcionLimpia %>% gsub(x = .,, pattern = PalabraClave, replacement = "") %>% toupper
i <- grep(pattern = "tor", x = (a) )
Tor <- paste0( a[i], collapse = " @CH@ " )
i <- grep(pattern = "amazon", x = (a) )
amazon <- paste0( a[i], collapse = " @CH@ " )
i <- grep(pattern = "microsoft", x = (a) )
microsoft <- paste0( a[i], collapse = " @CH@ " )
tablon <- rbind( tablon, data.table(ip = j, inetnum = inetnum, Pais = Pais, descr = descr, Tor = Tor, amazon = amazon, microsoft = microsoft))
espera <- ifelse( test = is.na(Pais) , yes = 1, no = sample(x = 5:15, size = 1) )
cat( "\nEsperamos ", espera,"\n")
Sys.sleep( time = espera) # segundos de delay para que la web no cante. Total no son muchos
cat( "\n", j,"terminado\n")
}
cat( "\n==============================================\n")
difftime( time1 = Sys.time(), time2 = t0, units = "m") %>% as.numeric %>% round(2) %>%
cat( "El proceso termino en ", ., " minutos\n")
cat( "\n==============================================\n")
tablon <- tablon[ !is.na(ip)]
miércoles, 29 de agosto de 2018
Descriptivas PySpark
En R es superfacil, Python, para variar, es un infierno. Pero siempre habrá quien defienda ese lenguaje no solo por su rapidez que la tiene, pero...
def descripcionDF (DFSpark, StringBool = True, timeStampBool = True, floatBool = True):
tipoVar = pd.DataFrame( DFSpark.dtypes, columns= ('variable', 'tipo') )
print( pd.crosstab(index=tipoVar.tipo, # Make a crosstab
columns="count") )
print( "========================================================================")
print( "Datos String")
print( "========================================================================")
x = tipoVar.variable [ tipoVar.tipo =='string' ].values.tolist()
if len(x) > 0 & StringBool:
for i in x: print( DFSpark.cube(i).count().orderBy(desc('count')).show() )
print( "========================================================================")
print( "Datos timeStamp")
print( "========================================================================")
x = tipoVar.variable [ tipoVar.tipo =='timestamp' ].values.tolist()
if len(x) > 0 & timeStampBool:
DFSparkFechas = DFSpark.select(x)
for i in x:
DFSparkFechas = DFSparkFechas.withColumn(i + '_Fecha', (year(i)* 1e10 + month(i)* 1e8 + dayofmonth(i)* 1e6 +\
hour(i)* 1e4 + minute(i) * 1e2 + second(i) ).cast(LongType()) )
print( DFSparkFechas.describe().show() )
print( "========================================================================")
print( "Datos float")
print( "========================================================================")
if len(tipoVar.variable [ tipoVar.tipo =='float' ]) > 0 & floatBool:
DFSpark.select(tipoVar.variable [ tipoVar.tipo =='float' ]).describe( ).show()
Suscribirse a:
Entradas (Atom)