Data Manipulation

In [1]:
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
library(dplyr)
library(stringr)
system("ls ../input")
data <- read_csv("F:/Documents/Fleet Data.csv")
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Parsed with column specification:
cols(
  `Parent Airline` = col_character(),
  Airline = col_character(),
  `Aircraft Type` = col_character(),
  Current = col_double(),
  Future = col_double(),
  Historic = col_double(),
  Total = col_double(),
  Orders = col_double(),
  `Unit Cost` = col_character(),
  `Total Cost (Current)` = col_character(),
  `Average Age` = col_double()
)
In [2]:
#making the data easier to deal with
colnames(data) <- c("parent_airline","airline","aircraft_type","current","future","historic","total","orders","unit_cost","total_cost","average_age")
#nrow(data)
data$total_cost <- as.numeric(gsub(",","",substring(data$total_cost,2)))
data$unit_cost <- as.numeric(gsub(",","",substring(data$unit_cost,2)))
data$plane_brand <- word(data$aircraft_type,1)
data <- data %>% mutate(newcost=ifelse(!is.na(orders),orders*unit_cost,0))
In [3]:
#adding flags for airbus and boeing
data <- data %>% mutate(airbus=ifelse(plane_brand=="Airbus",ifelse(is.na(current),0,current),0))
data <- data %>% mutate(boeing=ifelse(plane_brand=="Boeing",ifelse(is.na(current),0,current),0))
data <- data %>% mutate(embraer=ifelse(plane_brand=="Embraer",ifelse(is.na(current),0,current),0))
data <- data %>% mutate(bombardier=ifelse(plane_brand=="Bombardier",ifelse(is.na(current),0,current),0))
data <- data %>% mutate(mitsubishi=ifelse(plane_brand=="Mitsubishi",ifelse(is.na(current),0,current),0))
data <- data %>% mutate(embraer=ifelse(plane_brand=="Embraer",ifelse(is.na(current),0,current),0))
data <- data %>% mutate(comac=ifelse(plane_brand %in% c("COMAC","comac"),ifelse(is.na(current),0,current),0))
data <- data %>% mutate(mcdonnell=ifelse(plane_brand=="McDonnell",ifelse(is.na(current),0,current),0))
data <- data %>% mutate(canadair=ifelse(plane_brand=="Canadair",ifelse(is.na(current),0,current),0))
data <- data %>% mutate(atr=ifelse(plane_brand=="ATR",ifelse(is.na(current),0,current),0))
In [4]:
#table(data$plane_brand)

#adding flags for future airbus and future boeing
data <- data %>% mutate(airbus_future=ifelse(plane_brand=="Airbus",ifelse(is.na(orders),0,orders),0))
data <- data %>% mutate(boeing_future=ifelse(plane_brand=="Boeing",ifelse(is.na(orders),0,orders),0))

#The below list is not exhaustive.
lowcostcarriers<- c("AirAsia X","Gol Linhas Aereas","Azul","Tigerair Australia","Volaris","Interjet","Spirit Airlines","Frontier Airlines","Allegiant Air","WestJet","Air Canada Rouge","FlyDubai","Up","Monarch Airlines","Jet2","Flybe","Vueling","Iberia Express","Pobeda","Transavia","Germanwings","Eurowings","easyJet","Wizz Air","Lion Air","IndiGo", "Southwest Airlines", "RyanAir","AirAsia","LionAir","JetBlue Airways","Norwegian Air","WestJet","Pegasus Airlines","Flybe","SpiceJet","Air Arabia Egypt","Air Arabia Maroc","Air Cairo","Jambojet Limited","Mango","China United Airlines","Spring Airlines","Cebu Pacific Air","Jetstar Airways","AirAsia India","AirAsia Japan")
data$type <- NA
data <- data %>% mutate(type=ifelse(airline %in% lowcostcarriers,"Low Cost","Traditional"))

head(data %>% filter(type=="Low Cost"))
length(unique(data$airline))
length(unique(data$parent_airline))
parent_airlineairlineaircraft_typecurrentfuturehistorictotalordersunit_costtotal_cost...embraerbombardiermitsubishicomacmcdonnellcanadairatrairbus_futureboeing_futuretype
Aeroflot Pobeda Boeing 737 12 NA NA 12 NA 74 888 ... 0 0 0 0 0 0 0 0 0 Low Cost
Air Arabia Air Arabia EgyptAirbus A320 1 NA 2 3 NA 98 98 ... 0 0 0 0 0 0 0 0 0 Low Cost
Air Arabia Air Arabia MarocAirbus A320 5 NA 5 10 NA 98 490 ... 0 0 0 0 0 0 0 0 0 Low Cost
Air Canada Air Canada RougeAirbus A319 20 NA NA 20 NA 90 1792 ... 0 0 0 0 0 0 0 0 0 Low Cost
Air Canada Air Canada RougeAirbus A321 5 NA NA 5 NA 115 575 ... 0 0 0 0 0 0 0 0 0 Low Cost
Air Canada Air Canada RougeBoeing 767 19 NA NA 19 6 187 3551 ... 0 0 0 0 0 0 0 0 6 Low Cost
280
117

Let's look for largest airlines

In [5]:
#Let's find largest airlines

#lets find out which airlines have the largest fleets
airline_fleet <- data %>% group_by(airline) %>% summarise(fleet=sum(current,na.rm=TRUE),historic=sum(historic,na.rm=TRUE),orders=sum(orders,na.rm=TRUE),ratio=100*(orders/fleet),current_fleet_cost=sum(total_cost,na.rm=TRUE),value_per_fleet_unit=current_fleet_cost/fleet)
airline_fleet <- airline_fleet %>% mutate(type=ifelse(airline %in% lowcostcarriers,"Low Cost","Traditional")) 
head(airline_fleet %>% arrange(desc(fleet)))
airlinefleethistoricordersratiocurrent_fleet_costvalue_per_fleet_unittype
American Airlines 928 850 287 30.92672 103892 111.95259 Traditional
Delta Airlines 855 589 268 31.34503 79171 92.59766 Traditional
United Airlines 730 664 173 23.69863 90037 123.33836 Traditional
Southwest Airlines 718 195 284 39.55432 53118 73.98050 Low Cost
China Southern Airlines522 175 115 22.03065 61370 117.56705 Traditional
China Eastern Airlines 451 186 233 51.66297 51511 114.21508 Traditional

We see that the US leads, followed by China

Let's look for airline companies that exhibit Growth

In [6]:
#let's find out which airlines have the most orders
head(airline_fleet %>% arrange(desc(orders)))
airlinefleethistoricordersratiocurrent_fleet_costvalue_per_fleet_unittype
Lion Air 113 45 425 376.10619 9342 82.67257 Low Cost
IndiGo 124 16 420 338.70968 12152 98.00000 Low Cost
AirAsia 77 66 401 520.77922 7546 98.00000 Low Cost
American Airlines 928 850 287 30.92672 103892 111.95259 Traditional
Southwest Airlines718 195 284 39.55432 53118 73.98050 Low Cost
Delta Airlines 855 589 268 31.34503 79171 92.59766 Traditional

We see that the first three (Chinese companies) are growing at a faster rate if you look at the ratio between the current fleet and orders.

In [7]:
#ratio of orders vs current fleet for fleets more than 10
#head(airline_fleet %>% filter(fleet>10) %>% arrange(desc(ratio)))

#largest fleet costs carriers
#head(airline_fleet %>% filter(fleet>10) %>% arrange(desc(current_fleet_cost)))

#plot current vs future
g <- ggplot(airline_fleet,aes(x=fleet,y=orders,label=airline,color=as.factor(type)))+geom_point(size=0.5) +geom_text(size=2,hjust=0,vjust=0,nudge_x=7,nudge_y=-2)+ theme_minimal()
#geom_text(aes(label=rownames(airline_fleet)))
g + geom_smooth(aes(color=NA),method=lm,se=FALSE) + ggtitle("New Orders vs Existing Fleet Size")+labs(x="Current Fleet Size (Units)",y="New Orders (Units)")

You can see that Lion Air, IndiGo, and Air Asia are clear outliers

In [8]:
g <- ggplot(airline_fleet %>% filter(fleet<=200),aes(x=fleet,y=orders,label=airline,color=as.factor(type)))+geom_point(size=0.5) +geom_text(size=2,hjust=0,vjust=0,nudge_x=3,nudge_y=-2)+ theme_minimal()
#geom_text(aes(label=rownames(airline_fleet)))
g  + ggtitle("New Orders vs Existing Fleet Size")+labs(x="Current Fleet Size (Units)",y="New Orders (Units)")

When we pare down to fleet size < 200 units, the cluster becomes more clear

In [11]:
neworders <-data %>% filter(!is.na(orders)) %>% filter(orders>0) %>% group_by(aircraft_type) %>% summarise(orders=sum(orders),cost=mean(unit_cost)) 

newplanes<-c("Airbus A380","Airbus A350","Airbus A350 XWB","Airbus A350-900","Boeing 787","Boeing 787 Dreamliner")
neworders <- neworders %>% mutate(new=ifelse(aircraft_type%in%newplanes,"new","old"))

head(neworders %>% arrange(desc(orders)))

ggplot(neworders,aes(x=orders,y=cost,label=aircraft_type,color=(new)))+geom_point(size=0.5)+geom_text(size=2,hjust=0,vjust=0,nudge_x=20,nudge_y=-2)+ggtitle("New orders vs Cost")+labs(x="New Orders (units)",y="Unit Plane Cost (USD M)")+theme_minimal()
aircraft_typeorderscostnew
Airbus A320 2506 98 old
Boeing 737 2300 74 old
Airbus A321 1090 115 old
Boeing 787 Dreamliner 454 265 new
Boeing 777 423 295 old
Airbus A350 XWB 393 312 new
Warning message:
"Removed 1 rows containing missing values (geom_point)."Warning message:
"Removed 1 rows containing missing values (geom_text)."
This demonstrates that newer planes are quite expensive, being 2-3x the cost of a cheaper airplane like the Boeing 737 (one in the crashes).
It also demonstrates that newer planes sell less.
I would posit that the market looks for low-cost carriers at cost efficiency.
Let's compare the Boeing 737 to an Airbus.
In [25]:
buyingcheap <- data %>% filter(!is.na(orders)) %>% filter(orders>0) %>% filter(aircraft_type %in% c("Airbus A320","Boeing 737")) %>% group_by(airline) %>% summarise(orders=sum(orders))
head(buyingcheap %>% arrange(desc(orders)))
airlineorders
Lion Air 425
IndiGo 400
AirAsia 301
Southwest Airlines 284
RyanAir 205
China Eastern Airlines171
In [26]:
buying380 <- data %>% filter(!is.na(orders)) %>% filter(orders>0) %>% filter(aircraft_type=="Airbus A380") %>% group_by(airline) %>% summarise(orders=sum(orders))
head(buying380 %>% arrange(desc(orders)))
airlineorders
Emirates 54
China Southern Airlines17
Singapore Airlines 5
Qatar Airways 4
All Nippon Airways 3
Etihad Airways 2
In [27]:
buying787DL <- data %>% filter(!is.na(orders)) %>% filter(orders>0) %>% filter(aircraft_type=="Boeing 737") %>% group_by(airline) %>% summarise(orders=sum(orders))
head(buying787DL %>% arrange(desc(orders)))
airlineorders
Southwest Airlines284
RyanAir 205
Lion Air 204
Gol Linhas Aéreas 134
American Airlines 121
United Airlines 103

We see who's buying the Boeing 737 above

In [15]:
current_aircrafts <- data %>% filter(!is.na(current)) %>% filter(current>0) %>% group_by(plane_brand) %>% summarise(current=sum(current),cost=sum(total_cost,na.rm=TRUE))
head(current_aircrafts %>% arrange(desc(current)))
plane_brandcurrentcost
Boeing 8773 1179299
Airbus 7440 983031
Embraer 1370 52968
Canadair1039 41590
De 525 16407
ATR 491 10784

Plane manufacturers and mix by airline

Seems Boeing is largely responsible for today's fleets

In [16]:
bycompany <- data %>% group_by(airline) %>% summarise(total=sum(current,na.rm=TRUE),airbus=sum(airbus),boeing=sum(boeing),airbusper=round(100*airbus/total),boeingper=round(100*boeing/total),airbusadd=sum(airbus_future),boeingadd=sum(boeing_future),newtotal=total+airbusadd+boeingadd,newairbusper=round(100*(airbus+airbusadd)/newtotal))

bycompany <- bycompany %>% mutate(type=ifelse(airline %in% lowcostcarriers,"Low Cost","Traditional"))
head(bycompany %>% filter(total> 300) %>% arrange(desc(boeingper)))
head(bycompany %>% filter(total> 300) %>% arrange(desc(airbusper)))
airlinetotalairbusboeingairbusperboeingperairbusaddboeingaddnewtotalnewairbuspertype
RyanAir 366 0 366 0 100 0 205 571 0 Low Cost
Southwest Airlines718 0 718 0 100 0 284 1002 0 Low Cost
United Airlines 730 156 574 21 79 3 146 879 18 Traditional
Delta Airlines 855 180 483 21 56 119 74 1048 29 Traditional
Air China 375 185 190 49 51 71 84 530 48 Traditional
American Airlines 928 397 453 43 49 144 143 1215 45 Traditional
airlinetotalairbusboeingairbusperboeingperairbusaddboeingaddnewtotalnewairbuspertype
China Eastern Airlines 451 310 141 69 31 152 61 664 70 Traditional
China Southern Airlines522 288 214 55 41 82 13 617 60 Traditional
Air China 375 185 190 49 51 71 84 530 48 Traditional
American Airlines 928 397 453 43 49 144 143 1215 45 Traditional
Delta Airlines 855 180 483 21 56 119 74 1048 29 Traditional
United Airlines 730 156 574 21 79 3 146 879 18 Traditional
This can be indicative of clear preferences/partnerships depending on the company, for instance Ryan AIr and Southwest Airlines exclusively use Boeing. THis is not the case for Airbus. Not to mention that the highest percentage of Airbus are Chinese companies.
In [17]:
ggplot(data=bycompany,aes(x=airbusper,y=newairbusper,label=airline,color=type))+geom_point(size=0.5)+geom_text(size=2,hjust=0,vjust=0,nudge_x=1,nudge_y=-1)+theme_minimal()+ggtitle("Evolution (Rough Estimation) of Airbus Mix in the Future")+labs(x="Current Fleet Percentage of Airbus Planes (%)")+labs(y="Estimated Future Fleet Percentage of Airbus Planes (%)")
Let's compare the two in terms of sales. Airbus and Boeing seem to have a duopoly. The third is a tenth of Boeing for instance.
In [18]:
business <-data %>% filter(!is.na(orders)) %>% filter(orders>0) %>% group_by(plane_brand) %>% summarise(orders=sum(orders),cost=sum(newcost))

head(business %>% arrange(desc(cost)))
plane_brandorderscost
Airbus 4665 669940
Boeing 3450 480513
Embraer 325 13424
Bombardier 186 12762
Mitsubishi 220 10340
COMAC 80 5440

I'm going to attempt PCA (principal component analysis) to convert observations to variables that may correlate into linearly uncorrelated variables called principal components.

This enables us to view whether airlines have diversity in their fleets. UA is mostly just Airbus and Boeing.
In [19]:
fleets <- data %>% group_by(airline) %>% summarise(total=sum(current,na.rm=TRUE),airbus=sum(airbus),boeing=sum(boeing),embraer=sum(embraer),bombardier=sum(bombardier),atr=sum(atr),mcdonnell=sum(mcdonnell),canadair=sum(canadair))
#head(fleets)
#summary(fleets)
fleets<-as.data.frame(fleets)
row.names(fleets)<-fleets[,1]
fleets.pca <- prcomp(fleets[,3:ncol(fleets)],center=TRUE,scale=TRUE)
print(fleets.pca)
biplot(fleets.pca,cex=0.4,xlim=c(-0.2,0.6))
Standard deviations (1, .., p=7):
[1] 1.3342262 1.1528182 1.0588512 0.9712625 0.8330210 0.7772184 0.7268709

Rotation (n x k) = (7 x 7):
                   PC1         PC2         PC3        PC4        PC5        PC6
airbus      0.53826839 -0.02899423  0.20117794 -0.2047453  0.5405947 -0.4944229
boeing      0.60234086 -0.08737791  0.05479419 -0.1008275 -0.1733711  0.0719462
embraer    -0.10273959 -0.68126060  0.19173360  0.1278067  0.5056911  0.4587332
bombardier -0.07582006 -0.09804146 -0.58208725 -0.7781204  0.1322595  0.1511226
atr         0.04561279 -0.21516687 -0.71886429  0.5198873  0.1586036 -0.3504436
mcdonnell   0.54288640 -0.22079250 -0.18626712  0.1165498 -0.4020552  0.3761182
canadair   -0.18531688 -0.65019243  0.17168687 -0.2055252 -0.4665621 -0.5028119
                    PC7
airbus      0.300560456
boeing     -0.762330768
embraer    -0.077926528
bombardier -0.003409579
atr        -0.128872678
mcdonnell   0.552387916
canadair    0.026276718

Summary