Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Sign in
Toggle navigation
A
alpha-mind
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Dr.李
alpha-mind
Commits
0a6d9643
Commit
0a6d9643
authored
May 10, 2020
by
Dr.李
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
FEATURE: added winsorize wth interpolation
parent
23dbdadd
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
175 additions
and
13 deletions
+175
-13
winsorize.py
alphamind/data/winsorize.py
+117
-13
test_winsorize.py
alphamind/tests/data/test_winsorize.py
+58
-0
No files found.
alphamind/data/winsorize.py
View file @
0a6d9643
...
@@ -35,6 +35,46 @@ def mask_values_2d(x: np.ndarray,
...
@@ -35,6 +35,46 @@ def mask_values_2d(x: np.ndarray,
return
res
return
res
@
nb
.
njit
(
nogil
=
True
,
cache
=
True
)
def
interp_values_2d
(
x
:
np
.
ndarray
,
groups
:
np
.
ndarray
,
mean_values
:
np
.
ndarray
,
std_values
:
np
.
ndarray
,
num_stds
:
int
=
3
,
interval
:
float
=
0.5
)
->
np
.
ndarray
:
res
=
x
.
copy
()
length
,
width
=
x
.
shape
max_cat
=
np
.
max
(
groups
)
for
k
in
range
(
max_cat
+
1
):
target_idx
=
np
.
where
(
groups
==
k
)[
0
]
.
flatten
()
for
j
in
range
(
width
):
target_x
=
x
[
target_idx
,
j
]
target_res
=
target_x
.
copy
()
mean
=
mean_values
[
target_idx
[
0
],
j
]
std
=
std_values
[
target_idx
[
0
],
j
]
ubound
=
mean
+
num_stds
*
std
lbound
=
mean
-
num_stds
*
std
# upper bound abnormal values
idx
=
target_x
>
ubound
n
=
np
.
sum
(
idx
)
if
n
>
0
:
u_values
=
target_res
[
idx
]
q_values
=
u_values
.
argsort
()
.
argsort
()
target_res
[
idx
]
=
ubound
+
q_values
/
n
*
interval
*
std
# lower bound abnormal values
idx
=
target_x
<
lbound
n
=
np
.
sum
(
idx
)
if
n
>
0
:
l_values
=
target_res
[
idx
]
q_values
=
(
-
l_values
)
.
argsort
()
.
argsort
()
target_res
[
idx
]
=
lbound
-
q_values
/
n
*
interval
*
std
res
[
target_idx
,
j
]
=
target_res
return
res
@
nb
.
njit
(
nogil
=
True
,
cache
=
True
)
@
nb
.
njit
(
nogil
=
True
,
cache
=
True
)
def
mask_values_1d
(
x
:
np
.
ndarray
,
def
mask_values_1d
(
x
:
np
.
ndarray
,
mean_values
:
np
.
ndarray
,
mean_values
:
np
.
ndarray
,
...
@@ -46,38 +86,76 @@ def mask_values_1d(x: np.ndarray,
...
@@ -46,38 +86,76 @@ def mask_values_1d(x: np.ndarray,
for
j
in
range
(
width
):
for
j
in
range
(
width
):
ubound
=
mean_values
[
j
]
+
num_stds
*
std_values
[
j
]
ubound
=
mean_values
[
j
]
+
num_stds
*
std_values
[
j
]
lbound
=
mean_values
[
j
]
-
num_stds
*
std_values
[
j
]
lbound
=
mean_values
[
j
]
-
num_stds
*
std_values
[
j
]
for
i
in
range
(
length
):
res
[
x
[:,
j
]
>
ubound
,
j
]
=
ubound
if
x
[
i
,
j
]
>
ubound
:
res
[
x
[:,
j
]
<
lbound
,
j
]
=
lbound
res
[
i
,
j
]
=
ubound
return
res
elif
x
[
i
,
j
]
<
lbound
:
res
[
i
,
j
]
=
lbound
@
nb
.
njit
(
nogil
=
True
,
cache
=
True
)
def
interp_values_1d
(
x
:
np
.
ndarray
,
mean_values
:
np
.
ndarray
,
std_values
:
np
.
ndarray
,
num_stds
:
int
=
3
,
interval
:
float
=
0.5
)
->
np
.
ndarray
:
res
=
x
.
copy
()
length
,
width
=
x
.
shape
for
j
in
range
(
width
):
ubound
=
mean_values
[
j
]
+
num_stds
*
std_values
[
j
]
lbound
=
mean_values
[
j
]
-
num_stds
*
std_values
[
j
]
# upper bound abnormal values
idx
=
x
[:,
j
]
>
ubound
n
=
np
.
sum
(
idx
)
if
n
>
0
:
u_values
=
res
[
idx
,
j
]
q_values
=
u_values
.
argsort
()
.
argsort
()
res
[
idx
,
j
]
=
ubound
+
q_values
/
n
*
interval
*
std_values
[
j
]
# lower bound abnormal values
idx
=
x
[:,
j
]
<
lbound
n
=
np
.
sum
(
idx
)
if
n
>
0
:
l_values
=
res
[
idx
,
j
]
q_values
=
(
-
l_values
)
.
argsort
()
.
argsort
()
res
[
idx
,
j
]
=
lbound
-
q_values
/
n
*
interval
*
std_values
[
j
]
return
res
return
res
def
winsorize_normal
(
x
:
np
.
ndarray
,
num_stds
:
int
=
3
,
ddof
=
1
,
def
winsorize_normal
(
x
:
np
.
ndarray
,
num_stds
:
int
=
3
,
ddof
=
1
,
groups
:
np
.
ndarray
=
None
,
groups
:
np
.
ndarray
=
None
,
fill_
method
:
str
=
'flat'
,
method
:
str
=
'flat'
,
fill_interval
:
in
t
=
0.5
)
->
np
.
ndarray
:
interval
:
floa
t
=
0.5
)
->
np
.
ndarray
:
if
groups
is
not
None
:
if
groups
is
not
None
:
groups
=
group_mapping
(
groups
)
groups
=
group_mapping
(
groups
)
mean_values
=
transform
(
groups
,
x
,
'mean'
)
mean_values
=
transform
(
groups
,
x
,
'mean'
)
std_values
=
transform
(
groups
,
x
,
'std'
,
ddof
)
std_values
=
transform
(
groups
,
x
,
'std'
,
ddof
)
if
method
==
'flat'
:
res
=
mask_values_2d
(
x
,
mean_values
,
std_values
,
num_stds
)
res
=
mask_values_2d
(
x
,
mean_values
,
std_values
,
num_stds
)
else
:
res
=
interp_values_2d
(
x
,
groups
,
mean_values
,
std_values
,
num_stds
,
interval
)
else
:
else
:
std_values
=
simple_std
(
x
,
axis
=
0
,
ddof
=
ddof
)
std_values
=
simple_std
(
x
,
axis
=
0
,
ddof
=
ddof
)
mean_values
=
simple_mean
(
x
,
axis
=
0
)
mean_values
=
simple_mean
(
x
,
axis
=
0
)
if
method
==
'flat'
:
res
=
mask_values_1d
(
x
,
mean_values
,
std_values
,
num_stds
)
res
=
mask_values_1d
(
x
,
mean_values
,
std_values
,
num_stds
)
else
:
res
=
interp_values_1d
(
x
,
mean_values
,
std_values
,
num_stds
,
interval
)
return
res
return
res
class
NormalWinsorizer
(
object
):
class
NormalWinsorizer
(
object
):
def
__init__
(
self
,
num_stds
:
int
=
3
,
ddof
=
1
):
def
__init__
(
self
,
num_stds
:
int
=
3
,
ddof
:
int
=
1
,
method
:
str
=
'flat'
,
interval
:
float
=
0.5
):
self
.
num_stds
=
num_stds
self
.
num_stds
=
num_stds
self
.
ddof
=
ddof
self
.
ddof
=
ddof
self
.
mean
=
None
self
.
mean
=
None
self
.
std
=
None
self
.
std
=
None
self
.
labels
=
None
self
.
labels
=
None
self
.
method
=
method
self
.
interval
=
interval
def
fit
(
self
,
x
:
np
.
ndarray
,
groups
:
np
.
ndarray
=
None
):
def
fit
(
self
,
x
:
np
.
ndarray
,
groups
:
np
.
ndarray
=
None
):
if
groups
is
not
None
:
if
groups
is
not
None
:
...
@@ -92,9 +170,35 @@ class NormalWinsorizer(object):
...
@@ -92,9 +170,35 @@ class NormalWinsorizer(object):
def
transform
(
self
,
x
:
np
.
ndarray
,
groups
:
np
.
ndarray
=
None
)
->
np
.
ndarray
:
def
transform
(
self
,
x
:
np
.
ndarray
,
groups
:
np
.
ndarray
=
None
)
->
np
.
ndarray
:
if
groups
is
not
None
:
if
groups
is
not
None
:
index
=
array_index
(
self
.
labels
,
groups
)
index
=
array_index
(
self
.
labels
,
groups
)
return
mask_values_2d
(
x
,
self
.
mean
[
index
],
self
.
std
[
index
],
self
.
num_stds
)
if
self
.
method
==
'flat'
:
res
=
mask_values_2d
(
x
,
self
.
mean
[
index
],
self
.
std
[
index
],
self
.
num_stds
)
else
:
res
=
interp_values_2d
(
x
,
groups
,
self
.
mean
[
index
],
self
.
std
[
index
],
self
.
num_stds
,
self
.
interval
)
else
:
else
:
return
mask_values_1d
(
x
,
self
.
mean
,
self
.
std
,
self
.
num_stds
)
if
self
.
method
==
'flat'
:
res
=
mask_values_1d
(
x
,
self
.
mean
,
self
.
std
,
self
.
num_stds
)
else
:
res
=
interp_values_1d
(
x
,
self
.
mean
,
self
.
std
,
self
.
num_stds
,
self
.
interval
)
return
res
def
__call__
(
self
,
x
:
np
.
ndarray
,
groups
:
np
.
ndarray
=
None
)
->
np
.
ndarray
:
def
__call__
(
self
,
x
:
np
.
ndarray
,
groups
:
np
.
ndarray
=
None
)
->
np
.
ndarray
:
return
winsorize_normal
(
x
,
self
.
num_stds
,
self
.
ddof
,
groups
)
return
winsorize_normal
(
x
,
self
.
num_stds
,
self
.
ddof
,
groups
,
self
.
method
,
self
.
interval
)
if
__name__
==
'__main__'
:
x
=
np
.
random
.
randn
(
10000
,
1
)
groups
=
np
.
random
.
randint
(
0
,
3
,
10000
)
import
datetime
as
dt
start
=
dt
.
datetime
.
now
()
for
i
in
range
(
1000
):
winsorize_normal
(
x
,
method
=
'flat'
)
print
(
dt
.
datetime
.
now
()
-
start
)
start
=
dt
.
datetime
.
now
()
for
i
in
range
(
1000
):
winsorize_normal
(
x
,
method
=
'interp'
)
print
(
dt
.
datetime
.
now
()
-
start
)
alphamind/tests/data/test_winsorize.py
View file @
0a6d9643
...
@@ -17,6 +17,7 @@ from alphamind.data.winsorize import winsorize_normal
...
@@ -17,6 +17,7 @@ from alphamind.data.winsorize import winsorize_normal
class
TestWinsorize
(
unittest
.
TestCase
):
class
TestWinsorize
(
unittest
.
TestCase
):
def
setUp
(
self
):
def
setUp
(
self
):
np
.
random
.
seed
(
10
)
self
.
x
=
np
.
random
.
randn
(
3000
,
10
)
self
.
x
=
np
.
random
.
randn
(
3000
,
10
)
self
.
groups
=
np
.
random
.
randint
(
10
,
30
,
size
=
3000
)
self
.
groups
=
np
.
random
.
randint
(
10
,
30
,
size
=
3000
)
self
.
num_stds
=
2
self
.
num_stds
=
2
...
@@ -38,6 +39,33 @@ class TestWinsorize(unittest.TestCase):
...
@@ -38,6 +39,33 @@ class TestWinsorize(unittest.TestCase):
calculated_col
=
calc_winsorized
[:,
i
]
calculated_col
=
calc_winsorized
[:,
i
]
np
.
testing
.
assert_array_almost_equal
(
col_data
,
calculated_col
)
np
.
testing
.
assert_array_almost_equal
(
col_data
,
calculated_col
)
def
test_winsorize_normal_with_interp
(
self
):
calc_winsorized
=
winsorize_normal
(
self
.
x
,
self
.
num_stds
,
method
=
'interp'
)
std_values
=
self
.
x
.
std
(
axis
=
0
,
ddof
=
1
)
mean_value
=
self
.
x
.
mean
(
axis
=
0
)
lower_bound
=
mean_value
-
self
.
num_stds
*
std_values
upper_bound
=
mean_value
+
self
.
num_stds
*
std_values
for
i
in
range
(
np
.
size
(
calc_winsorized
,
1
)):
col_data
=
self
.
x
[:,
i
]
.
copy
()
idx
=
col_data
>
upper_bound
[
i
]
u_values
=
col_data
[
idx
]
q_values
=
u_values
.
argsort
()
.
argsort
()
if
len
(
q_values
)
>
0
:
col_data
[
idx
]
=
upper_bound
[
i
]
+
q_values
/
len
(
q_values
)
*
0.5
*
std_values
[
i
]
idx
=
col_data
<
lower_bound
[
i
]
l_values
=
col_data
[
idx
]
q_values
=
(
-
l_values
)
.
argsort
()
.
argsort
()
if
len
(
q_values
)
>
0
:
col_data
[
idx
]
=
lower_bound
[
i
]
-
q_values
/
len
(
q_values
)
*
0.5
*
std_values
[
i
]
calculated_col
=
calc_winsorized
[:,
i
]
np
.
testing
.
assert_array_almost_equal
(
col_data
,
calculated_col
)
def
test_winsorize_normal_with_group
(
self
):
def
test_winsorize_normal_with_group
(
self
):
cal_winsorized
=
winsorize_normal
(
self
.
x
,
self
.
num_stds
,
groups
=
self
.
groups
)
cal_winsorized
=
winsorize_normal
(
self
.
x
,
self
.
num_stds
,
groups
=
self
.
groups
)
...
@@ -55,6 +83,36 @@ class TestWinsorize(unittest.TestCase):
...
@@ -55,6 +83,36 @@ class TestWinsorize(unittest.TestCase):
exp_winsorized
=
pd
.
DataFrame
(
self
.
x
)
.
groupby
(
self
.
groups
)
.
transform
(
impl
)
.
values
exp_winsorized
=
pd
.
DataFrame
(
self
.
x
)
.
groupby
(
self
.
groups
)
.
transform
(
impl
)
.
values
np
.
testing
.
assert_array_almost_equal
(
cal_winsorized
,
exp_winsorized
)
np
.
testing
.
assert_array_almost_equal
(
cal_winsorized
,
exp_winsorized
)
def
test_winsorize_normal_with_group_and_interp
(
self
):
cal_winsorized
=
winsorize_normal
(
self
.
x
,
self
.
num_stds
,
groups
=
self
.
groups
,
method
=
'interp'
)
def
impl
(
x
):
x
=
x
.
values
std_values
=
x
.
std
(
axis
=
0
,
ddof
=
1
)
mean_value
=
x
.
mean
(
axis
=
0
)
lower_bound
=
mean_value
-
self
.
num_stds
*
std_values
upper_bound
=
mean_value
+
self
.
num_stds
*
std_values
col_data
=
x
.
copy
()
idx
=
col_data
>
upper_bound
u_values
=
col_data
[
idx
]
q_values
=
u_values
.
argsort
()
.
argsort
()
if
len
(
q_values
)
>
0
:
col_data
[
idx
]
=
upper_bound
+
q_values
/
len
(
q_values
)
*
0.5
*
std_values
idx
=
col_data
<
lower_bound
l_values
=
col_data
[
idx
]
q_values
=
(
-
l_values
)
.
argsort
()
.
argsort
()
if
len
(
q_values
)
>
0
:
col_data
[
idx
]
=
lower_bound
-
q_values
/
len
(
q_values
)
*
0.5
*
std_values
return
col_data
exp_winsorized
=
pd
.
DataFrame
(
self
.
x
)
.
groupby
(
self
.
groups
)
.
transform
(
impl
)
.
values
np
.
testing
.
assert_array_almost_equal
(
cal_winsorized
,
exp_winsorized
)
def
test_normal_winsorizer
(
self
):
def
test_normal_winsorizer
(
self
):
s
=
NormalWinsorizer
(
num_stds
=
self
.
num_stds
)
s
=
NormalWinsorizer
(
num_stds
=
self
.
num_stds
)
s
.
fit
(
self
.
x
)
s
.
fit
(
self
.
x
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment