#import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the raw Google Play Store export into a dataframe
df0 = pd.read_csv(filepath_or_buffer='googleplaystore.csv')


# Report the (rows, columns) dimensions of the raw data
df0.shape

(10841, 13)


# Preview the first five records
df0.head(n=5)


# Summarise the column names, non-null counts and dtypes
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


# Count the missing values in each column of the raw data
df0.isna().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64


# Keep only the rows whose Rating is present
df1 = df0.loc[df0['Rating'].notna()]

# Dimensions after discarding the unrated apps
df1.shape

(9367, 13)


# Cross-verify that no missing Rating values are left
df1['Rating'].isna().sum()

0


# Count the missing values left in each column after the Rating filter
df1.isna().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       4
Android Ver       3
dtype: int64


# Show the rows whose 'Android Ver' value is missing
df1.loc[df1['Android Ver'].isna()]


# Remove the malformed record (index 10472), whose values are shifted one
# column to the left so that 'Category' holds '1.9' and 'Android Ver' is null.
# The trailing .copy() makes df1 an independent frame, so the column
# assignments made to it afterwards do not raise SettingWithCopyWarning.
is_shifted_row = df1['Android Ver'].isnull() & (df1['Category'] == '1.9')
df1 = df1[~is_shifted_row].copy()


# Re-display the rows with a missing 'Android Ver' to confirm the removal
df1.loc[df1['Android Ver'].isna()]


# Frequency of each 'Android Ver' value, most common first
df1['Android Ver'].value_counts(ascending=False)

4.1 and up            2059
Varies with device    1319
4.0.3 and up          1240
4.0 and up            1131
4.4 and up             875
2.3 and up             582
5.0 and up             535
4.2 and up             338
2.3.3 and up           240
3.0 and up             211
2.2 and up             208
4.3 and up             207
2.1 and up             113
1.6 and up              87
6.0 and up              48
7.0 and up              41
3.2 and up              31
2.0 and up              27
5.1 and up              18
1.5 and up              16
3.1 and up               8
2.0.1 and up             7
4.4W and up              6
8.0 and up               5
7.1 and up               3
5.0 - 8.0                2
4.0.3 - 7.1.1            2
1.0 and up               2
5.0 - 6.0                1
4.1 - 7.1.1              1
7.0 - 7.1.1              1
Name: Android Ver, dtype: int64


# 'Android Ver' is categorical, so its missing entries are imputed with the
# mode (the most frequently occurring value)
most_common_android = df1['Android Ver'].mode().iloc[0]
df1['Android Ver'] = df1['Android Ver'].fillna(most_common_android)


# Recount the values to confirm the mode absorbed the previously missing rows
df1['Android Ver'].value_counts(ascending=False)

4.1 and up            2061
Varies with device    1319
4.0.3 and up          1240
4.0 and up            1131
4.4 and up             875
2.3 and up             582
5.0 and up             535
4.2 and up             338
2.3.3 and up           240
3.0 and up             211
2.2 and up             208
4.3 and up             207
2.1 and up             113
1.6 and up              87
6.0 and up              48
7.0 and up              41
3.2 and up              31
2.0 and up              27
5.1 and up              18
1.5 and up              16
3.1 and up               8
2.0.1 and up             7
4.4W and up              6
8.0 and up               5
7.1 and up               3
5.0 - 8.0                2
4.0.3 - 7.1.1            2
1.0 and up               2
5.0 - 6.0                1
4.1 - 7.1.1              1
7.0 - 7.1.1              1
Name: Android Ver, dtype: int64


# Number of missing 'Android Ver' values left after imputation
df1['Android Ver'].isna().sum()

0


# Per-column count of the missing values still present in the dataframe
df1.isna().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       4
Android Ver       0
dtype: int64


# Frequency of each 'Current Ver' value, most common first
df1['Current Ver'].value_counts(ascending=False)

Varies with device    1415
1.0                    458
1.1                    195
1.2                    126
1.3                    120
                      ... 
4.5.15                   1
5.4.7                    1
0.46.0                   1
4.5.2.1                  1
0.0.52                   1
Name: Current Ver, Length: 2638, dtype: int64


# Impute the missing 'Current Ver' entries with the mode (the most
# frequently occurring value)
most_common_current = df1['Current Ver'].mode().iloc[0]
df1['Current Ver'] = df1['Current Ver'].fillna(most_common_current)


# Recount the values to confirm the mode absorbed the previously missing rows
df1['Current Ver'].value_counts(ascending=False)

Varies with device    1419
1.0                    458
1.1                    195
1.2                    126
1.3                    120
                      ... 
4.5.15                   1
5.4.7                    1
0.46.0                   1
4.5.2.1                  1
0.0.52                   1
Name: Current Ver, Length: 2638, dtype: int64


# Frequency table of 'Current Ver' after imputation
df1['Current Ver'].value_counts(ascending=False)

Varies with device    1419
1.0                    458
1.1                    195
1.2                    126
1.3                    120
                      ... 
4.5.15                   1
5.4.7                    1
0.46.0                   1
4.5.2.1                  1
0.0.52                   1
Name: Current Ver, Length: 2638, dtype: int64


# Number of missing 'Current Ver' values left after imputation
df1['Current Ver'].isna().sum()

0


# List the dtype of every column before the numeric-looking ones are converted
df1.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object


# Look at a single record to see the raw text formats
df1.head(n=1)


# Convert Price from text such as '$1.49' (or '0' for free apps) to float.
# Only a leading '$' is stripped, so a value that lacks the prefix keeps all of
# its characters instead of silently losing its first digit; text that is not
# a number still raises ValueError.
df1['Price'] = df1['Price'].str.lstrip('$').astype(float)


# Confirm the new dtype
df1['Price'].dtype

dtype('float64')


# Look at the raw text format of the Installs column
df1['Installs'].head(n=5)

0        10,000+
1       500,000+
2     5,000,000+
3    50,000,000+
4       100,000+
Name: Installs, dtype: object


# dtype of Installs before conversion
df1['Installs'].dtype

dtype('O')


# Convert Installs from text such as '10,000+' to an integer count.
# The thousands separators and the trailing '+' are removed explicitly rather
# than by slicing off the last character, so a value with no '+' suffix
# (e.g. a bare '0') is parsed intact instead of losing a digit or failing on
# an empty string.
installs_text = df1['Installs'].str.replace(',', '', regex=False)
df1['Installs'] = installs_text.str.rstrip('+').astype('int64')


# Confirm the new dtype
df1['Installs'].dtypes

dtype('int64')


# Cast the review counts from text to integers
df1['Reviews'] = df1['Reviews'].map(int)


# Confirm the new dtype
df1['Reviews'].dtype

dtype('int64')


# Sanity check: an app should not have more reviews than installs, so keep
# only the records where Reviews <= Installs
df1 = df1.loc[df1['Reviews'] <= df1['Installs']]


# Sanity check: list any app marked 'Free' that still has a positive price
df1.loc[(df1['Type'] == 'Free') & (df1['Price'] > 0)]


#import the plotting libraries
import pandas as pd
import matplotlib.pyplot as plt


# Drop the extreme price outliers: keep only the apps priced below 200
df1 = df1.loc[df1['Price'] < 200]


# Box plot of the prices of the remaining apps that cost more than 30
df1.loc[df1['Price'] > 30, 'Price'].plot.box()

<AxesSubplot:>


# Histogram of the review counts, split into 100 bins
plt.hist(x=df1['Reviews'], bins=100)
plt.show()


# Keep only the apps with at most 1 million reviews, then report the new shape
df1 = df1.loc[df1['Reviews'] <= 1_000_000]
df1.shape

(8640, 13)


# Clean the Installs column: keep only the apps with at most 100 million installs
# NOTE(review): apps with exactly 100,000,000 installs survive this filter;
# switch to `<` if they are meant to be removed as well.
df1 = df1.loc[df1['Installs'] <= 100_000_000]
df1.shape

(8630, 13)


# Preview the first five records of the cleaned data
df1.head(n=5)


import seaborn as sns


import warnings
# Silence every warning category from here on
warnings.simplefilter('ignore')


# Distribution of the app ratings (20 bins, red) on matplotlib's dark theme
plt.style.use('dark_background')
sns.displot(df1['Rating'], bins=20, color='r')
plt.title('Distribution of app ratings', fontsize=12)
plt.show()


# Number of apps in each Content Rating category, most common first
df1['Content Rating'].value_counts(ascending=False)

Everyone           6944
Teen                928
Mature 17+          417
Everyone 10+        337
Adults only 18+       3
Unrated               1
Name: Content Rating, dtype: int64


# Drop the sparsely populated 'Adults only 18+' and 'Unrated' categories
# (3 apps and 1 app respectively).
# The trailing .copy() makes df1 an independent frame, so adding a column to
# it afterwards does not raise SettingWithCopyWarning.
rare_ratings = ['Adults only 18+', 'Unrated']
df1 = df1[~df1['Content Rating'].isin(rare_ratings)].copy()


# Confirm which Content Rating categories remain
df1["Content Rating"].value_counts()

Everyone        6944
Teen             928
Mature 17+       417
Everyone 10+     337
Name: Content Rating, dtype: int64


# Pie chart of the number of apps per Content Rating
df1['Content Rating'].value_counts().plot(kind='pie')
plt.show()


# The same counts as a bar chart
df1['Content Rating'].value_counts().plot(kind='bar')

<AxesSubplot:>


# Bar chart of the number of apps per 'Android Ver' value
df1['Android Ver'].value_counts().plot(kind='bar')

<AxesSubplot:>


### Size vs Rating

# 'Size' is still text at this point (e.g. '19M', '8.7M'), so it is converted
# to numeric kilobytes before plotting. Left as text, matplotlib would treat
# every distinct string as a category and seaborn's pair plot would leave the
# column out, because it only uses numeric columns.
def _size_to_kb(size):
    """Return the Series *size* as float kilobytes.

    A trailing 'M' is read as megabytes and a trailing 'k' as kilobytes.
    Any entry that is not a number followed by one of those suffixes becomes
    NaN. A column that is already numeric is passed through as float.

    NOTE(review): assumes 1 MB = 1000 KB and that 'k' marks kilobytes; only
    'M' values are visible in the previewed rows, so confirm against the data.
    """
    if pd.api.types.is_numeric_dtype(size):
        return size.astype(float)
    text = size.astype(str).str.strip()
    magnitude = pd.to_numeric(text.str[:-1], errors='coerce')
    kb_per_unit = text.str[-1].str.upper().map({'K': 1, 'M': 1000})
    return magnitude * kb_per_unit


# Plot from a derived frame so that df1['Size'] itself is left untouched
plot_df = df1.assign(Size=_size_to_kb(df1['Size']))

## Scatter plot of Size against Rating, the matplotlib way
plt.scatter(plot_df['Size'], plot_df['Rating'])
plt.xlabel('Size (KB)')
plt.ylabel('Rating')
plt.show()


### The same relationship as a seaborn joint plot
# x and y are passed by keyword: seaborn 0.12+ rejects positional vectors
sns.jointplot(data=plot_df, x='Size', y='Rating')
plt.show()


## Joint plot with a regression fit for Price against Rating
plt.style.use('ggplot')
sns.jointplot(data=df1, x='Price', y='Rating', kind='reg')
plt.show()


## Pair plot for Reviews, Size, Price and Rating
sns.pairplot(plot_df[['Reviews', 'Size', 'Price', 'Rating']])
plt.show()


## Bar plot of the mean Rating for each Content Rating
df1.groupby('Content Rating')['Rating'].mean().plot(kind='bar')

<AxesSubplot:xlabel='Content Rating'>


## Bar plot of the median Rating for each Content Rating
df1.groupby('Content Rating')['Rating'].median().plot(kind='bar')

<AxesSubplot:xlabel='Content Rating'>


## Bar plots of Rating per Content Rating using seaborn's estimator parameter:
## first aggregated with the median, then with the minimum
for rating_stat in (np.median, np.min):
    sns.barplot(data=df1, x='Content Rating', y='Rating', estimator=rating_stat)
    plt.show()


## Box plot of Rating for each Content Rating
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally and raises TypeError when two vectors are passed positionally.
plt.figure(figsize=[9,7])
sns.boxplot(data=df1, x='Content Rating', y='Rating')
plt.show()


# Box plot for the Rating column only; x= keeps the box horizontal regardless
# of how the installed seaborn version interprets a lone positional argument
plt.figure(figsize=(9,7))
sns.boxplot(x=df1['Rating'])
plt.show()


# Number of apps in each genre, most common first
df1['Genres'].value_counts(ascending=False)

Tools                     694
Entertainment             508
Education                 464
Medical                   349
Finance                   315
                         ... 
Arcade;Pretend Play         1
Parenting;Brain Games       1
Lifestyle;Pretend Play      1
Lifestyle;Education         1
Racing;Pretend Play         1
Name: Genres, Length: 115, dtype: int64


# Restrict the data to the four genres with the most apps
top_genres = ['Tools', 'Entertainment', 'Education', 'Medical']
df_top4 = df1.loc[df1['Genres'].isin(top_genres)]


# Box plot comparing the Rating distribution across those four genres
plt.figure(figsize=(9, 7))
sns.set_theme(style='darkgrid')
sns.boxplot(data=df_top4, x='Genres', y='Rating')
plt.show()


## Extract the month number from the 'Last Updated' date strings
# (the previous bare `df1['Last Updated'].head()` preview was discarded
# without being shown, so it has been dropped)
last_updated = pd.to_datetime(df1['Last Updated'])
df1['updated_month'] = last_updated.dt.month


## Line plot of the mean Rating for each update month
plt.figure(figsize=[10, 5])
monthly_mean_rating = df1.groupby('updated_month')['Rating'].mean()
monthly_mean_rating.plot()
plt.ylabel('Rating')
plt.show()


## Pivot table of total Installs: one row per updated month, one column per Content Rating
# The aggregation is named by string; passing the builtin `sum` is deprecated in pandas 2.1+
monthly = pd.pivot_table(
    data=df1,
    values='Installs',
    index='updated_month',
    columns='Content Rating',
    aggfunc='sum',
)


# Stacked bar chart of the monthly installs; `stacked` takes a real boolean,
# not the string 'True' (which only worked because any non-empty string is truthy)
monthly.plot(kind='bar', stacked=True, figsize=[10,6])

<AxesSubplot:xlabel='updated_month'>


## Stacked bar chart again, now showing each month's installs as proportions
rating_columns = ['Everyone', 'Everyone 10+', 'Mature 17+', 'Teen']
installs_by_rating = monthly[rating_columns]
# Divide every row by its own total so that the shares of a month add up to 1
monthly_prop = installs_by_rating.div(installs_by_rating.sum(axis=1), axis=0)
# `stacked` takes a real boolean rather than the string 'True'
monthly_prop.plot(kind='bar', stacked=True, figsize=[10,6])

<AxesSubplot:xlabel='updated_month'>

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver
4453	[substratum] Vacuum: P	PERSONALIZATION	4.4	230	11M	1,000+	Paid	$1.49	Everyone	Personalization	July 20, 2018	4.4	NaN
4490	Pi Dark [substratum]	PERSONALIZATION	4.5	189	2.1M	10,000+	Free	0	Everyone	Personalization	March 27, 2018	1.1	NaN
10472	Life Made WI-Fi Touchscreen Photo Frame	1.9	19.0	3.0M	1,000+	Free	0	Everyone	NaN	February 11, 2018	1.0.19	4.0 and up	NaN

Google Playstore Case Study

Data Handling and Cleaning

Handling Incorrect Data Types

Sanity Checks

Outliers Analysis Using Boxplot

Histograms

Data Visualisation with Seaborn

Distribution Plots

Scatter Plots

Pair Plots

Top 4 most popular Genres

Line Plots

Stacked Bar Charts

	App	Category	Rating	Reviews	Size	Installs	Type	Content Rating	Genres	Last Updated	Current Ver	Android Ver
0	Photo Editor & Candy Camera & Grid & ScrapBook	ART_AND_DESIGN	4.1	159	19M	10,000+	Free	Everyone	Art & Design	January 7, 2018	1.0.0	4.0.3 and up
1	Coloring book moana	ART_AND_DESIGN	3.9	967	14M	500,000+	Free	Everyone	Art & Design;Pretend Play	January 15, 2018	2.0.0	4.0.3 and up
2	U Launcher Lite – FREE Live Cool Themes, Hide ...	ART_AND_DESIGN	4.7	87510	8.7M	5,000,000+	Free	Everyone	Art & Design	August 1, 2018	1.2.4	4.0.3 and up
3	Sketch - Draw & Paint	ART_AND_DESIGN	4.5	215644	25M	50,000,000+	Free	Teen	Art & Design	June 8, 2018	Varies with device	4.2 and up
4	Pixel Draw - Number Art Coloring Book	ART_AND_DESIGN	4.3	967	2.8M	100,000+	Free	Everyone	Art & Design;Creativity	June 20, 2018	1.1	4.4 and up

	App	Category	Rating	Reviews	Size	Installs	Type	Content Rating	Genres	Last Updated	Current Ver	Android Ver
0	Photo Editor & Candy Camera & Grid & ScrapBook	ART_AND_DESIGN	4.1	159	19M	10000	Free	Everyone	Art & Design	January 7, 2018	1.0.0	4.0.3 and up
1	Coloring book moana	ART_AND_DESIGN	3.9	967	14M	500000	Free	Everyone	Art & Design;Pretend Play	January 15, 2018	2.0.0	4.0.3 and up
2	U Launcher Lite – FREE Live Cool Themes, Hide ...	ART_AND_DESIGN	4.7	87510	8.7M	5000000	Free	Everyone	Art & Design	August 1, 2018	1.2.4	4.0.3 and up
3	Sketch - Draw & Paint	ART_AND_DESIGN	4.5	215644	25M	50000000	Free	Teen	Art & Design	June 8, 2018	Varies with device	4.2 and up
4	Pixel Draw - Number Art Coloring Book	ART_AND_DESIGN	4.3	967	2.8M	100000	Free	Everyone	Art & Design;Creativity	June 20, 2018	1.1	4.4 and up