In [1]:
import ipykernel
import folium
import pandas as pd
import numpy as np
from scipy import stats
import nvector
import math
import colorlover
from functools import lru_cache

%matplotlib inline

import plotly
plotly.offline.init_notebook_mode()
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import cufflinks as cf
cf.set_config_file(offline=True, world_readable=True, theme='pearl')
In [21]:
gps_points = pd.read_csv('./data/sample3.csv')
In [22]:
def extract_time(point):
    try:
        time_str = str(int(point['time_value']))
        time_str = "0"*(8-len(time_str)) + time_str
        datetime_str = str(int(point['date_value'])) + time_str
        return pd.to_datetime(datetime_str, format='%d%m%y%H%M%S%f')
    except Exception as e:
        # print(point, e)
        pass 

def GEO_2_ECEF(lat, lng, alt):
    wgs84 = nvector.FrameE(name='WGS84')
    point = wgs84.GeoPoint(latitude=lat, longitude=lng, z=alt, degrees=True)
    p_EB_E = point.to_ecef_vector()
    return p_EB_E.pvector.ravel()

def ECEF_2_GEO(x, y, z):
    wgs84 = nvector.FrameE(name='WGS84')
    pos = np.vstack((x, y, z))  # m
    point = wgs84.ECEFvector(pos).to_geo_point()
    return point.latitude_deg[0], point.longitude_deg[0], point.z[0]

def GEO_dist(lat, lng, alt, lat_ref, lng_ref, alt_ref):
    wgs84 = nvector.FrameE(name='WGS84')
    point = wgs84.GeoPoint(latitude=lat, longitude=lng, z=alt, degrees=True)
    point_ref = wgs84.GeoPoint(latitude=lat_ref, longitude=lng_ref, z=alt_ref, degrees=True)
    ellipsoidal_dist_12, _, _ = point_ref.distance_and_azimuth(point)
    return ellipsoidal_dist_12

def GEO_2_NEU(lat, lng, alt, lat_ref, lng_ref, alt_ref):
    wgs84 = nvector.FrameE(name='WGS84')
    pointA = wgs84.GeoPoint(latitude=lat_ref, longitude=lng_ref, z=alt_ref, degrees=True)
    pointB = wgs84.GeoPoint(latitude=lat, longitude=lng, z=alt, degrees=True)
    
    p_AB_ECEF = nvector.diff_positions(pointA, pointB)
    frame_NEU = nvector.FrameN(pointA)
    p_AB_NEU = p_AB_ECEF.change_frame(frame_NEU).pvector.ravel()
    return p_AB_NEU

def pre_processing(gps_points):    
    # Need more than 10 points to accept a path
    gps_points = gps_points.groupby('id').filter(lambda x: len(x) > 10) 
    # Need more than 3 satellites to accept a point 
    gps_points = gps_points[gps_points['satellites_value'] > 3]
    
    gps_points['time'] = gps_points.apply(extract_time, axis=1)
    gps_points = gps_points[gps_points['time'] > pd.Timestamp('2016-01-01')] # Remove Error

    def PD_GEO_2_ECEF(row):
        x, y, z = GEO_2_ECEF(row['location_lat'],
                             row['location_lng'],
                             row['location_alt'])
        return pd.Series({'location_x': x, 'location_y': y, 'location_z': z})
    
    ecef = gps_points.apply(PD_GEO_2_ECEF, axis=1)
    gps_points = gps_points.join(ecef)

    @lru_cache(maxsize=None)
    def get_center(path_id):
        path = gps_points[gps_points.id==path_id]
        return path[path.satellites_value >= 6].mean()[['location_x', 'location_y', 'location_z']]
    
    center_ecef = gps_points['id'].apply(get_center) 
    center_ecef.columns = ['center_x', 'center_y', 'center_z']
    gps_points = gps_points.join(center_ecef)


    def PD_ECEF_2_GEO(row):
        lat, lng, alt = ECEF_2_GEO(row['center_x'],
                                   row['center_y'],
                                   row['center_z'])
        return pd.Series({'center_lat': lat, 'center_lng': lng, 'center_alt': alt})

    center_geo = gps_points.apply(PD_ECEF_2_GEO, axis=1) 
    gps_points = gps_points.join(center_geo)
    
    def get_distance(row, flat=False):
        return GEO_dist(row['location_lat'],
                        row['location_lng'],
                        0 if flat else row['location_alt'],
                        row['center_lat'],
                        row['center_lng'],
                        0 if flat else  row['center_alt'])
    
    gps_points['distance'] = gps_points.apply(get_distance, axis=1)
    gps_points['distance_2D'] = gps_points.apply(get_distance, axis=1, flat=True)

    def PD_GEO_2_NEU(row):
        n, e, d = GEO_2_NEU(row['location_lat'], row['location_lng'], row['location_alt'],
                            row['center_lat'],   row['center_lng']  , row['center_alt'])
        return pd.Series({'north': n, 'east': e, 'up': d})

    neu = gps_points.apply(PD_GEO_2_NEU, axis=1)
    gps_points = gps_points.join(neu)
    
    gps_points['delta_lat'] = gps_points['location_lat'] - gps_points['center_lat']
    gps_points['delta_lng'] = gps_points['location_lng'] - gps_points['center_lng']
    gps_points['delta_alt'] = gps_points['location_alt'] - gps_points['center_alt']
    gps_points['delta_x'] = gps_points['location_x'] - gps_points['center_x']
    gps_points['delta_y'] = gps_points['location_y'] - gps_points['center_y']
    gps_points['delta_z'] = gps_points['location_z'] - gps_points['center_z']
    return gps_points

gps_points = pre_processing(gps_points)
In [4]:
gps_points.columns
Out[4]:
Index(['id', 'location_lat', 'location_lng', 'date_value', 'time_value',
       'speed_value', 'course_value', 'location_alt', 'satellites_value',
       'hdop_value', 'time', 'location_x', 'location_y', 'location_z',
       'center_x', 'center_y', 'center_z', 'center_alt', 'center_lat',
       'center_lng', 'distance_2D', 'down', 'east', 'north', 'delta_lat',
       'delta_lng', 'delta_alt', 'delta_x', 'delta_y', 'delta_z'],
      dtype='object')
In [4]:
groupedby = gps_points.groupby('satellites_value')
In [5]:
# https://plot.ly/~ben_derv/8/number-of-extracted-positions-by-satellites/
    
groupedby.size().iplot(
    kind='bar', 
    title="Number of extracted positions by satellites", 
    yTitle="Number of extracted positions",
    xTitle="Number of satellites",
    dimensions=(500,400)
)
In [6]:
# https://plot.ly/~ben_derv/12/distribution-of-recorded-positions/

def draw_points_by_satellites(path):
    iplot({
        'data': [
            {
                'x': path[path['satellites_value']==sat]['location_lng'],
                'y': path[path['satellites_value']==sat]['location_lat'],
                'name': sat, 
                'mode': 'markers',
            } for sat in range(3, path['satellites_value'].max()+1)
        ],
        'layout': {
            'xaxis': {'title': 'Longitude'},
            'yaxis': {'title': 'Latitude'},
            'title': 'Distribution of recorded positions'
        }
    })

draw_points_by_satellites(gps_points[gps_points.id==8])
In [7]:
# https://plot.ly/~ben_derv/4/cumulated-of-gps-record-by-distance/
def draw_distributed_cumsum(path):
    def distributed_cumsum(path):
        values, base = np.histogram(path['distance_2D'], bins=40)
        cumulative = np.cumsum(values)
        return cumulative/cumulative[-1], base


    iplot({
        'data': [{
                'y': distributed_cumsum(path[path['satellites_value'] == sat])[0],
                'x': distributed_cumsum(path[path['satellites_value'] == sat])[1],
                'name': sat, 
            } for sat in range(3, path['satellites_value'].max()+1)
        ],
        'layout': {
            'xaxis': {'title': 'Distance from center (meters)'},
            'yaxis': {'title': '% cumulated of positions recorded'},
            'title': 'Cumulated % of GPS record by distance '
        }
    })

draw_distributed_cumsum(gps_points)
In [8]:
v
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-e734a88a1110> in <module>()
----> 1 v

NameError: name 'v' is not defined
In [9]:
# https://plot.ly/~ben_derv/14/lat-vs-lng/

def draw_2d_distribution(path):
    scl = colorlover.scales['9']['seq']['Blues']
    colorscale = [ [ float(i)/float(len(scl)-1), scl[i] ] for i in range(len(scl)) ]

    path = path[path['distance_2D'] < 20]

    x_grid = np.linspace(
        path['location_lng'].min(),
        path['location_lng'].max(),
        100)
    y_grid = np.linspace(
        path['location_lat'].min(),
        path['location_lat'].max(),
        100)

    def kde_scipy(x, x_grid, bandwidth=0.2 ):
        kde = stats.gaussian_kde(x, bw_method=bandwidth / x.std(ddof=1) )
        return kde.evaluate(x_grid)

    trace1 = Histogram2dContour(
        x=path['location_lng'],
        y=path['location_lat'],
        name='density',
        ncontours=20,
        colorscale=colorscale,
        showscale=False
    )
    trace2 = Histogram(
        x=path['location_lng'],
        name='x density',
        yaxis='y2',
        histnorm='probability density',
        marker=dict(color='rgb(217, 217, 217)'),
        nbinsx=25
    )
    trace2s = Scatter(
        x=x_grid,
        y=kde_scipy( path['location_lng'].as_matrix(), x_grid ),
        yaxis='y2',
        line = dict( color='rgb(31, 119, 180)' ),
        fill='tonexty',
    )
    trace3 = Histogram(
        y=path['location_lat'],
        name='y density',
        xaxis='x2',
        histnorm='probability density',
        marker=dict(color='rgb(217, 217, 217)'),
        nbinsy=50
    )
    trace3s = Scatter(
        y=y_grid,
        x=kde_scipy( path['location_lat'].as_matrix(), y_grid ),
        xaxis='x2',
        line = dict( color='rgb(31, 119, 180)' ),
        fill='tonextx',
    )

    data = [trace1, trace2, trace2s, trace3, trace3s]

    layout = Layout(
        showlegend=False,
        autosize=False,
        width=700,
        height=700,
        hovermode='closest',
        bargap=0,

        xaxis=dict(domain=[0, 0.746], linewidth=2, linecolor='#444', title='Lng',
                   showgrid=False, zeroline=False, ticks='', showline=True, mirror=True),

        yaxis=dict(domain=[0, 0.746],linewidth=2, linecolor='#444', title='Lat',
                   showgrid=False, zeroline=False, ticks='', showline=True, mirror=True),

        xaxis2=dict(domain=[0.75, 1], showgrid=False, zeroline=False, ticks='',
                    showticklabels=False ),

        yaxis2=dict(domain=[0.75, 1], showgrid=False, zeroline=False, ticks='',
                    showticklabels=False ),
    )

    iplot(Figure(data=data, layout=layout))

draw_2d_distribution(gps_points)
In [10]:
#https://plot.ly/22/~ben_derv/#

groupedby.mean()['hdop_value'].iplot(
    kind='bar', 
    title="HDOP average by number of satellites", 
    xTitle="Number of satellites",
    yTitle='HDOP',
    dimensions=(800,500)
)
In [11]:
# LATITUDE

groupedby['north'].mean().iplot(
    kind='bar', 
    title="Average latitude distance from center by satellites", 
    xTitle='Number of satellites',
    yTitle='Latitude distance (meters)',
    dimensions=(500,400)
)

groupedby['north'].std().iplot(
    kind='bar', 
    title="Standard deviation of latitude distance from center by satellites", 
    xTitle='Number of satellites',
    yTitle='Latitude distance (meters)',
    dimensions=(500,400)
)
In [12]:
# LATITUDE

groupedby['east'].mean().iplot(
    kind='bar', 
    title="Average longitude distance from center by satellites", 
    xTitle='Number of satellites',
    yTitle='Longitude distance (meters)',
    dimensions=(500,400)
)

groupedby['east'].std().iplot(
    kind='bar', 
    title="STD of longitude distance from center by satellites", 
    xTitle='Number of satellites',
    yTitle='Longitude distance (meters)',
    dimensions=(500,400)
)
In [13]:
groupedby['up'].mean().iplot(
   kind='bar', 
   title="Altitude mean by number of satellites (meters)", 
   xTitle='Number of satellites',
   yTitle='Altitude distance',
   dimensions=(500,400)
)
groupedby['up'].std().iplot(
   kind='bar', 
   title="Altitude std by number of satellites (meters)", 
   xTitle='Number of satellites',
   yTitle='Altitude distance',
   dimensions=(500,400)
)
In [23]:
def auto_correlation(x):
    res = np.correlate(x, x, mode='full')
    return res / max(res)

path_time = gps_points.set_index(gps_points['time']).sort_index()
path = path_time[path_time.id==7]
iplot({'data': [
            {
                'y': auto_correlation(list(path[col].values - path[col].values.mean())),
                # 'x': (path_time.max()['time'] - path.time).astype('timedelta64[m]'),
                'name': col
            } for col in ['north', 'east', 'satellites_value', 'hdop_value']
        ],
        'layout': {
            'xaxis': {'title': 'Time shift (5 seconds delta)'},
            'yaxis': {'title': 'Normalized auto-correlation'},
            'title': 'Auto-correlation'
        } 
    })
In [14]:
gps_points = pd.read_csv('./data/sample.csv')

def create_map(data, filename='gps.html', max_points=100):
    center = [data['location_lat'].mean(), data['location_lng'].mean()]
    map_osm = folium.Map(location=center, zoom_start=18)
    for _, record in data.iterrows():
        if _ > max_points: 
            print('WARNING: DATASET TOO LARGE')
            break
        map_osm.add_children(folium.Marker(location=[record['location_lat'], record['location_lng']], popup=str(record['id'])))
    map_osm.save(outfile=filename)
    
path = gps_points[gps_points.id==452]
create_map(path)
In [15]:
%%HTML
<iframe width=100% height=500 src="gps.html"></iframe>
In [16]:
def linear_regression(path):
    slope, intercept, r_value, p_value, std_err = stats.linregress(path['location_lat'], path['location_lng'])
    line = slope*path['location_lat'] + intercept

    data = Data([{
        'x': path['location_lat'],
        'y': path['location_lng'],
        'mode': 'markers',
        'name':'Data'
    }, {
        'y': line, 
        'x': path['location_lat'],
        'name':'Linear Fit'
    }])
    layout = Layout(
        annotations = [Annotation(
            x=path['location_lat'].mean(),
            y=path['location_lng'].mean()+3*path['location_lng'].std(),
            text='r_value = {}'.format(r_value),
            showarrow=False,
            font=Font(size=16)
        )]
    )
    iplot(Figure(data=data, layout=layout))

linear_regression(path)
In [18]:
# https://plot.ly/~ben_derv/0/gps/
# This is a special record in 'mode 1', which record every 5 meters. It allow us to 'investigate' 
# the noise received by the satellites

gps_points = pd.read_csv('./data/sample2.csv')
draw_points_by_satellites(gps_points)
In [20]:
 
Out[20]:
id location_lat location_lng date_value time_value speed_value course_value location_alt satellites_value hdop_value
0 18 48.297161 4.069875 30616 22374294 0 0 291.6 4 910
1 18 48.296799 4.070173 30616 22374394 0 0 290.5 4 910
2 18 48.296494 4.070358 30616 22374494 0 0 270.2 4 910
3 18 48.296284 4.070485 30616 22374594 258 29670 251.3 4 900
4 18 48.296059 4.070591 30616 22374694 258 29670 233.2 4 900
5 18 48.295906 4.070608 30616 22374794 258 29670 218.2 4 900
6 18 48.295967 4.070442 30616 22374894 258 29670 215.9 4 900
7 18 48.295834 4.070537 30616 22374994 258 29670 209.6 4 900
8 18 48.295784 4.070530 30616 22375094 158 29904 207.1 4 900
9 18 48.295700 4.070621 30616 22375294 76 35772 205.4 4 900
10 18 48.295643 4.070581 30616 22375494 257 9439 195.0 4 900
11 18 48.295567 4.070560 30616 22375594 330 11246 185.4 4 900
12 18 48.295525 4.070503 30616 22375694 330 11246 178.5 4 900
13 18 48.295509 4.070347 30616 22375894 35 14841 168.2 4 900
14 18 48.295551 4.070287 30616 22375994 35 14841 168.3 4 900
15 18 48.295586 4.070219 30616 22380094 247 33189 166.9 4 900
16 18 48.295639 4.070083 30616 22380394 175 35169 164.4 4 900
17 18 48.295567 4.070048 30616 22380594 250 31675 154.1 4 900
18 18 48.295494 4.070078 30616 22380794 112 10370 146.9 4 900
19 18 48.295399 4.070192 30616 22380993 344 12301 140.3 4 900
20 18 48.295364 4.070237 30616 22381093 432 12034 138.2 4 900
21 18 48.295280 4.070298 30616 22381293 567 13094 133.0 4 890
22 18 48.295254 4.070394 30616 22381594 691 12194 132.5 4 890
23 18 48.295223 4.070453 30616 22381894 616 13700 131.9 4 890
24 18 48.295193 4.070530 30616 22382194 819 13416 133.9 4 890
25 18 48.295242 4.070473 30616 22382994 412 13667 136.2 4 890
26 18 48.295273 4.070352 30616 22383193 105 12721 133.4 4 890
27 18 48.295250 4.070282 30616 22383593 206 29154 125.1 4 890
28 18 48.295280 4.070202 30616 22384093 414 28738 127.1 4 890
29 18 48.295315 4.070110 30616 22384293 415 28472 128.5 4 880
... ... ... ... ... ... ... ... ... ... ...
10044 18 48.295544 4.069487 40616 8013700 200 29933 71.4 4 290
10045 18 48.295578 4.069435 40616 8014100 85 21915 64.8 4 290
10046 18 48.295609 4.069382 40616 8015000 178 16738 57.0 4 290
10047 18 48.295662 4.069370 40616 8015800 206 15015 63.1 4 280
10048 18 48.295696 4.069425 40616 8020300 228 14381 78.5 4 280
10049 18 48.295750 4.069470 40616 8020700 124 631 96.0 4 280
10050 18 48.295753 4.069398 40616 8022100 188 14751 86.3 3 290
10051 18 48.295795 4.069345 40616 8022600 64 32729 84.7 3 290
10052 18 48.295738 4.069360 40616 8024800 35 15891 83.3 3 290
10053 18 48.295769 4.069297 40616 8025600 139 33995 79.6 4 280
10054 18 48.295815 4.069318 40616 8030300 168 35916 86.1 4 280
10055 18 48.295807 4.069396 40616 8031100 71 3856 98.3 4 280
10056 18 48.295753 4.069438 40616 8031600 171 16054 102.0 4 280
10057 18 48.295712 4.069480 40616 8031900 238 13581 104.0 4 280
10058 18 48.295654 4.069470 40616 8033200 56 14335 88.0 3 730
10059 18 48.295601 4.069509 40616 8033400 223 14943 86.9 3 720
10060 18 48.295536 4.069551 40616 8033700 196 15560 85.3 3 720
10061 18 48.295471 4.069602 40616 8033900 458 14916 84.3 3 720
10062 18 48.295418 4.069637 40616 8034100 325 16139 83.3 3 720
10063 18 48.295372 4.069664 40616 8034400 233 16307 82.2 3 720
10064 18 48.295315 4.069702 40616 8034900 272 15101 81.2 3 720
10065 18 48.295368 4.069706 40616 8041000 67 12843 89.5 4 290
10066 18 48.295429 4.069667 40616 8041200 102 34541 92.4 4 290
10067 18 48.295494 4.069623 40616 8041400 256 34649 94.8 4 290
10068 18 48.295547 4.069620 40616 8042100 45 32220 101.8 4 290
10069 18 48.295593 4.069561 40616 8042400 158 30721 101.4 4 280
10070 18 48.295639 4.069512 40616 8042600 291 31530 101.3 4 280
10071 18 48.295700 4.069505 40616 8043600 46 32267 107.6 4 280
10072 18 48.295650 4.069595 40616 8044300 310 12920 115.8 4 280
10073 18 48.295616 4.069642 40616 8044400 310 12920 118.5 5 180

10074 rows × 10 columns

In [ ]: