[1]:
import eland as ed import pandas as pd import numpy as np import matplotlib.pyplot as plt from elasticsearch import Elasticsearch # Import standard test settings for consistent results from eland.conftest import *
Create an eland.DataFrame from a flights index
flights
[2]:
ed_flights = ed.DataFrame('localhost', 'flights')
[3]:
type(ed_flights)
eland.dataframe.DataFrame
Compare to pandas DataFrame (created from the same data)
[4]:
pd_flights = ed.eland_to_pandas(ed_flights)
[5]:
type(pd_flights)
pandas.core.frame.DataFrame
[6]:
pd_flights.columns
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName', 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek', 'timestamp'], dtype='object')
[7]:
ed_flights.columns
[8]:
pd_flights.dtypes
AvgTicketPrice float64 Cancelled bool Carrier object Dest object DestAirportID object ... OriginLocation object OriginRegion object OriginWeather object dayOfWeek int64 timestamp datetime64[ns] Length: 27, dtype: object
[9]:
ed_flights.dtypes
[10]:
pd_flights.select_dtypes(include=np.number)
13059 rows × 7 columns
[11]:
ed_flights.select_dtypes(include=np.number)
[12]:
pd_flights.empty
False
[13]:
ed_flights.empty
[14]:
pd_flights.shape
(13059, 27)
[15]:
ed_flights.shape
Note, eland.DataFrame.index does not mirror pandas.DataFrame.index.
eland.DataFrame.index
pandas.DataFrame.index
[16]:
pd_flights.index
Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ... '13049', '13050', '13051', '13052', '13053', '13054', '13055', '13056', '13057', '13058'], dtype='object', length=13059)
[17]:
# NBVAL_IGNORE_OUTPUT ed_flights.index
<eland.index.Index at 0x7f07752ad250>
[18]:
ed_flights.index.es_index_field
'_id'
Note, eland.DataFrame.values is not supported.
eland.DataFrame.values
[19]:
pd_flights.values
array([[841.2656419677076, False, 'Kibana Airlines', ..., 'Sunny', 0, Timestamp('2018-01-01 00:00:00')], [882.9826615595518, False, 'Logstash Airways', ..., 'Clear', 0, Timestamp('2018-01-01 18:27:00')], [190.6369038508356, False, 'Logstash Airways', ..., 'Rain', 0, Timestamp('2018-01-01 17:11:14')], ..., [997.7518761454494, False, 'Logstash Airways', ..., 'Sunny', 6, Timestamp('2018-02-11 04:09:27')], [1102.8144645388556, False, 'JetBeats', ..., 'Hail', 6, Timestamp('2018-02-11 08:28:21')], [858.1443369038839, False, 'JetBeats', ..., 'Rain', 6, Timestamp('2018-02-11 14:54:34')]], dtype=object)
[20]:
try: ed_flights.values except AttributeError as e: print(e)
This method would scan/scroll the entire Elasticsearch index(s) into memory. If this is explicitly required, and there is sufficient memory, call `ed.eland_to_pandas(ed_df).values`
[21]:
pd_flights.head()
5 rows × 27 columns
[22]:
ed_flights.head()
[23]:
pd_flights.tail()
[24]:
ed_flights.tail()
[25]:
pd_flights.keys()
[26]:
ed_flights.keys()
[27]:
pd_flights.get('Carrier')
0 Kibana Airlines 1 Logstash Airways 2 Logstash Airways 3 Kibana Airlines 4 Kibana Airlines ... 13054 Logstash Airways 13055 Logstash Airways 13056 Logstash Airways 13057 JetBeats 13058 JetBeats Name: Carrier, Length: 13059, dtype: object
[28]:
ed_flights.get('Carrier')
[29]:
pd_flights.get(['Carrier', 'Origin'])
13059 rows × 2 columns
List input not currently supported by eland.DataFrame.get
eland.DataFrame.get
[30]:
try: ed_flights.get(['Carrier', 'Origin']) except TypeError as e: print(e)
unhashable type: 'list'
[31]:
pd_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')
68 rows × 27 columns
eland.DataFrame.query requires qualifier on bool i.e.
eland.DataFrame.query
ed_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled') fails
ed_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled')
[32]:
ed_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')
[33]:
pd_flights[(pd_flights.Carrier=="Kibana Airlines") & (pd_flights.AvgTicketPrice > 900.0) & (pd_flights.Cancelled == True)]
[34]:
ed_flights[(ed_flights.Carrier=="Kibana Airlines") & (ed_flights.AvgTicketPrice > 900.0) & (ed_flights.Cancelled == True)]
[35]:
pd_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
eland.DataFrame.aggregate currently only supported numeric columns
eland.DataFrame.aggregate
[36]:
ed_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
[37]:
pd_flights.count()
AvgTicketPrice 13059 Cancelled 13059 Carrier 13059 Dest 13059 DestAirportID 13059 ... OriginLocation 13059 OriginRegion 13059 OriginWeather 13059 dayOfWeek 13059 timestamp 13059 Length: 27, dtype: int64
[38]:
ed_flights.count()
[39]:
pd_flights.describe()
8 rows × 7 columns
Values returned from eland.DataFrame.describe may vary due to results of Elasticsearch aggregations.
eland.DataFrame.describe
[40]:
# NBVAL_IGNORE_OUTPUT ed_flights.describe()
[41]:
pd_flights.info()
<class 'pandas.core.frame.DataFrame'> Index: 13059 entries, 0 to 13058 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AvgTicketPrice 13059 non-null float64 1 Cancelled 13059 non-null bool 2 Carrier 13059 non-null object 3 Dest 13059 non-null object 4 DestAirportID 13059 non-null object 5 DestCityName 13059 non-null object 6 DestCountry 13059 non-null object 7 DestLocation 13059 non-null object 8 DestRegion 13059 non-null object 9 DestWeather 13059 non-null object 10 DistanceKilometers 13059 non-null float64 11 DistanceMiles 13059 non-null float64 12 FlightDelay 13059 non-null bool 13 FlightDelayMin 13059 non-null int64 14 FlightDelayType 13059 non-null object 15 FlightNum 13059 non-null object 16 FlightTimeHour 13059 non-null float64 17 FlightTimeMin 13059 non-null float64 18 Origin 13059 non-null object 19 OriginAirportID 13059 non-null object 20 OriginCityName 13059 non-null object 21 OriginCountry 13059 non-null object 22 OriginLocation 13059 non-null object 23 OriginRegion 13059 non-null object 24 OriginWeather 13059 non-null object 25 dayOfWeek 13059 non-null int64 26 timestamp 13059 non-null datetime64[ns] dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17) memory usage: 3.2+ MB
[42]:
ed_flights.info()
<class 'eland.dataframe.DataFrame'> Index: 13059 entries, 0 to 13058 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AvgTicketPrice 13059 non-null float64 1 Cancelled 13059 non-null bool 2 Carrier 13059 non-null object 3 Dest 13059 non-null object 4 DestAirportID 13059 non-null object 5 DestCityName 13059 non-null object 6 DestCountry 13059 non-null object 7 DestLocation 13059 non-null object 8 DestRegion 13059 non-null object 9 DestWeather 13059 non-null object 10 DistanceKilometers 13059 non-null float64 11 DistanceMiles 13059 non-null float64 12 FlightDelay 13059 non-null bool 13 FlightDelayMin 13059 non-null int64 14 FlightDelayType 13059 non-null object 15 FlightNum 13059 non-null object 16 FlightTimeHour 13059 non-null float64 17 FlightTimeMin 13059 non-null float64 18 Origin 13059 non-null object 19 OriginAirportID 13059 non-null object 20 OriginCityName 13059 non-null object 21 OriginCountry 13059 non-null object 22 OriginLocation 13059 non-null object 23 OriginRegion 13059 non-null object 24 OriginWeather 13059 non-null object 25 dayOfWeek 13059 non-null int64 26 timestamp 13059 non-null datetime64[ns] dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17) memory usage: 64.0 bytes
[43]:
pd_flights.max(numeric_only=True)
AvgTicketPrice 1199.729053 Cancelled 1.000000 DistanceKilometers 19881.482315 DistanceMiles 12353.780369 FlightDelay 1.000000 FlightDelayMin 360.000000 FlightTimeHour 31.715034 FlightTimeMin 1902.902032 dayOfWeek 6.000000 dtype: float64
eland.DataFrame.max,min,mean,sum only aggregate numeric columns
eland.DataFrame.max,min,mean,sum
[44]:
ed_flights.max(numeric_only=True)
AvgTicketPrice 1199.729004 Cancelled 1.000000 DistanceKilometers 19881.482422 DistanceMiles 12353.780273 FlightDelay 1.000000 FlightDelayMin 360.000000 FlightTimeHour 31.715034 FlightTimeMin 1902.901978 dayOfWeek 6.000000 dtype: float64
[45]:
pd_flights.min(numeric_only=True)
AvgTicketPrice 100.020528 Cancelled 0.000000 DistanceKilometers 0.000000 DistanceMiles 0.000000 FlightDelay 0.000000 FlightDelayMin 0.000000 FlightTimeHour 0.000000 FlightTimeMin 0.000000 dayOfWeek 0.000000 dtype: float64
[46]:
ed_flights.min(numeric_only=True)
AvgTicketPrice 100.020531 Cancelled 0.000000 DistanceKilometers 0.000000 DistanceMiles 0.000000 FlightDelay 0.000000 FlightDelayMin 0.000000 FlightTimeHour 0.000000 FlightTimeMin 0.000000 dayOfWeek 0.000000 dtype: float64
[47]:
pd_flights.mean(numeric_only=True)
AvgTicketPrice 628.253689 Cancelled 0.128494 DistanceKilometers 7092.142455 DistanceMiles 4406.853013 FlightDelay 0.251168 FlightDelayMin 47.335171 FlightTimeHour 8.518797 FlightTimeMin 511.127842 dayOfWeek 2.835975 dtype: float64
[48]:
ed_flights.mean(numeric_only=True)
AvgTicketPrice 628.253689 Cancelled 0.128494 DistanceKilometers 7092.142457 DistanceMiles 4406.853010 FlightDelay 0.251168 FlightDelayMin 47.335171 FlightTimeHour 8.518797 FlightTimeMin 511.127842 dayOfWeek 2.835975 dtype: float64
[49]:
pd_flights.sum(numeric_only=True)
AvgTicketPrice 8.204365e+06 Cancelled 1.678000e+03 DistanceKilometers 9.261629e+07 DistanceMiles 5.754909e+07 FlightDelay 3.280000e+03 FlightDelayMin 6.181500e+05 FlightTimeHour 1.112470e+05 FlightTimeMin 6.674818e+06 dayOfWeek 3.703500e+04 dtype: float64
[50]:
ed_flights.sum(numeric_only=True)
[51]:
pd_flights[['Carrier', 'Origin', 'Dest']].nunique()
Carrier 4 Origin 156 Dest 156 dtype: int64
[52]:
ed_flights[['Carrier', 'Origin', 'Dest']].nunique()
[53]:
pd_flights.drop(columns=['AvgTicketPrice', 'Cancelled', 'DestLocation', 'Dest', 'DestAirportID', 'DestCityName', 'DestCountry'])
13059 rows × 20 columns
[54]:
ed_flights.drop(columns=['AvgTicketPrice', 'Cancelled', 'DestLocation', 'Dest', 'DestAirportID', 'DestCityName', 'DestCountry'])
[55]:
pd_flights.select_dtypes(include=np.number).hist(figsize=[10,10]) plt.show()
[56]:
ed_flights.select_dtypes(include=np.number).hist(figsize=[10,10]) plt.show()
[57]:
ed_flights2 = ed_flights[(ed_flights.OriginAirportID == 'AMS') & (ed_flights.FlightDelayMin > 60)] ed_flights2 = ed_flights2[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']] ed_flights2 = ed_flights2.tail()
[58]:
print(ed_flights2.es_info())
es_index_pattern: flights Index: es_index_field: _id is_source_field: False Mappings: capabilities: es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name timestamp timestamp True date strict_date_hour_minute_second datetime64[ns] True True False timestamp OriginAirportID OriginAirportID True keyword None object True True False OriginAirportID DestAirportID DestAirportID True keyword None object True True False DestAirportID FlightDelayMin FlightDelayMin True integer None int64 True True False FlightDelayMin Operations: tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))] size: 5 sort_params: _doc:desc _source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin'] body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}} post_processing: [('sort_index')]