Exploratory Data Analysis Project

12 minute read

png

import pandas as pd
import numpy as np

df = pd.read_csv('Books with genre.csv')

df.head()

	bookID	title	title_desc	Genre_1	Genre_2	Genre_3	authors	average_rating	isbn	isbn13	language_code	num_pages	ratings_count	text_reviews_count	publication_date	publisher
0	1	Harry Potter and the Half-Blood Prince	Harry Potter #6)	Fantasy	Young adult literature	Fiction	J.K. Rowling/Mary GrandPrÃ©	4.57	439785960	9.780000e+12	eng	652	2095690	27591	9/16/2006	Scholastic Inc.
1	2	Harry Potter and the Order of the Phoenix	Harry Potter #5)	Fantasy	Young adult literature	Fiction	J.K. Rowling/Mary GrandPrÃ©	4.49	439358078	9.780000e+12	eng	870	2153167	29221	9/1/2004	Scholastic Inc.
2	4	Harry Potter and the Chamber of Secrets	Harry Potter #2)	Speculative fiction	Fantasy	Fiction	J.K. Rowling	4.42	439554896	9.780000e+12	eng	352	6333	244	11/1/2003	Scholastic
3	5	Harry Potter and the Prisoner of Azkaban	Harry Potter #3)	Fantasy	Speculative fiction	Young adult literature	J.K. Rowling/Mary GrandPrÃ©	4.56	043965548X	9.780000e+12	eng	435	2339585	36325	5/1/2004	Scholastic Inc.
4	14	The Hitchhiker's Guide to the Galaxy	Hitchhiker's Guide to the Galaxy #1)	Science Fiction	Comic novel	Speculative fiction	Douglas Adams	4.22	1400052920	9.780000e+12	eng	215	4930	460	8/3/2004	Crown

df.shape

(2375, 16)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2375 entries, 0 to 2374
Data columns (total 16 columns):
bookID                2375 non-null int64
title                 2375 non-null object
title_desc            881 non-null object
Genre_1               2375 non-null object
Genre_2               1907 non-null object
Genre_3               1392 non-null object
authors               2375 non-null object
average_rating        2375 non-null float64
isbn                  2375 non-null object
isbn13                2375 non-null float64
language_code         2375 non-null object
num_pages             2375 non-null int64
ratings_count         2375 non-null int64
text_reviews_count    2375 non-null int64
publication_date      2375 non-null object
publisher             2375 non-null object
dtypes: float64(2), int64(4), object(10)
memory usage: 297.0+ KB

df.describe()

	bookID	average_rating	isbn13	num_pages	ratings_count	text_reviews_count
count	2375.000000	2375.000000	2.375000e+03	2375.000000	2.375000e+03	2375.000000
mean	19600.002526	3.920851	9.780067e+12	363.136421	5.384854e+04	1498.799158
std	13119.490184	0.238521	8.181855e+08	214.242202	2.101828e+05	4475.537133
min	1.000000	2.820000	9.780000e+12	0.000000	0.000000e+00	0.000000
25%	8250.500000	3.790000	9.780000e+12	231.000000	5.820000e+02	49.000000
50%	16729.000000	3.940000	9.780000e+12	336.000000	5.421000e+03	265.000000
75%	30245.500000	4.080000	9.780000e+12	449.500000	2.709350e+04	1049.000000
max	45572.000000	4.570000	9.790000e+12	1500.000000	4.597666e+06	94265.000000

features = list(df.columns)

list(df.columns) # List of all columns.

['bookID',
 'title',
 'title_desc',
 'Genre_1',
 'Genre_2',
 'Genre_3',
 'authors',
 'average_rating',
 'isbn',
 'isbn13',
 'language_code',
 'num_pages',
 'ratings_count',
 'text_reviews_count',
 'publication_date',
 'publisher']

# List of numeric columns, selected by dtype. describe() would compute summary
# statistics only to discard them, and it describes the object columns instead
# when a frame happens to contain no numeric ones.
numeric_features = df.select_dtypes(include='number').columns.tolist()

list(df.describe().columns)

['bookID',
 'average_rating',
 'isbn13',
 'num_pages',
 'ratings_count',
 'text_reviews_count']

# Columns left after removing numeric features. A comprehension keeps the
# original column order; a set difference returns them in arbitrary order,
# which can change from one run to the next.
leftover_features = [col for col in features if col not in numeric_features]

list(set(features)-set(numeric_features))

['Genre_3',
 'Genre_2',
 'language_code',
 'Genre_1',
 'title',
 'title_desc',
 'publisher',
 'authors',
 'isbn',
 'publication_date']

# Here, we are taking features that have fewer than 150 unique values to be
# our categorical features for analysis.
unique_counts = df[leftover_features].nunique()
# Use the public .index accessor; ._index is a pandas-internal attribute.
categorical_features = unique_counts[unique_counts < 150].index.tolist()

df[leftover_features].nunique()

Genre_3               64
Genre_2               70
language_code          9
Genre_1               98
title               1865
title_desc           800
publisher            656
authors             1235
isbn                2375
publication_date    1262
dtype: int64

df[leftover_features].nunique().loc[df[leftover_features].nunique()<150]

Genre_3          64
Genre_2          70
language_code     9
Genre_1          98
dtype: int64

# Names of the columns that pass the < 150 unique-values cutoff
# (public .index accessor instead of the private ._index attribute).
df[leftover_features].nunique().loc[df[leftover_features].nunique()<150].index.to_list()

['Genre_3', 'Genre_2', 'language_code', 'Genre_1']

df.isnull().values.any() # Are there any null values?

True

# Percentage of null values over all cells of the frame.
# np.prod replaces the np.product alias, which was removed in NumPy 2.0.
(df.isnull().sum().sum() / np.prod(df.shape)) * 100

7.75

df.isnull().sum()

bookID                   0
title                    0
title_desc            1494
Genre_1                  0
Genre_2                468
Genre_3                983
authors                  0
average_rating           0
isbn                     0
isbn13                   0
language_code            0
num_pages                0
ratings_count            0
text_reviews_count       0
publication_date         0
publisher                0
dtype: int64

df.isnull().sum().sum()

# Total number of cells (rows * columns); df.shape is (2375, 16) at this point.
# np.prod replaces the np.product alias, which was removed in NumPy 2.0.
np.prod(df.shape)

Data Cleaning

df[df.duplicated()] # to check if there are any duplicate rows

	bookID	title	title_desc	Genre_1	Genre_2	Genre_3	authors	average_rating	isbn	isbn13	language_code	num_pages	ratings_count	text_reviews_count	publication_date	publisher

sorted(df['Genre_2'].unique()) # to check for any typographical errors or inconsistent CAPS.

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-23-a70f0ddb9afe> in <module>
----> 1 sorted(df['Genre_2'].unique()) # to check for any typographical errors or inconsistent CAPS.

TypeError: '<' not supported between instances of 'float' and 'str'

It raises an error because this column contains ‘nan’ (missing) values, which are floats and cannot be compared with strings while sorting. We will replace those values with ‘Genre Not Mentioned’ for the analysis.

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# a column selection is chained assignment: under pandas Copy-on-Write it
# modifies a temporary object and leaves df unchanged.
df['Genre_2'] = df['Genre_2'].fillna('Genre Not Mentioned')
sorted(df['Genre_2'].unique())

['Absurdist fiction',
 'Adventure',
 'Adventure novel',
 'Alternate history',
 'Apocalyptic and post-apocalyptic fiction',
 'Autobiography',
 'Bangsian fantasy',
 'Bildungsroman',
 'Biography',
 "Children's literature",
 'Chivalric romance',
 'Comedy',
 'Comic novel',
 'Conspiracy fiction',
 'Crime Fiction',
 'Detective fiction',
 'Drama',
 'Dystopia',
 'Erotica',
 'Existentialism',
 'Fairytale fantasy',
 'Fantasy',
 'Feminist science fiction',
 'Fiction',
 'Genre Not Mentioned',
 'Gothic fiction',
 'Graphic novel',
 'Historical fiction',
 'Historical novel',
 'Historical whodunnit',
 'Horror',
 'Humour',
 'Inspirational',
 'Juvenile fantasy',
 'Künstlerroman',
 'Literary fiction',
 'Lost World',
 'Magic realism',
 'Mathematics',
 'Memoir',
 'Military science fiction',
 'Morality play',
 'Mystery',
 'Non-fiction',
 'Novel',
 'Novella',
 'Poetry',
 'Politics',
 'Postmodernism',
 'Psychological novel',
 'Psychology',
 'Reference',
 'Religious text',
 'Roman à clef',
 'Romance novel',
 'Satire',
 'Science',
 'Science Fiction',
 'Social criticism',
 'Speculative fiction',
 'Spy fiction',
 'Supernatural',
 'Suspense',
 'Techno-thriller',
 'Time travel',
 'Tragicomedy',
 'True crime',
 'Vampire fiction',
 'War novel',
 'Western',
 'Young adult literature']

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# a column selection is chained assignment: under pandas Copy-on-Write it
# modifies a temporary object and leaves df unchanged.
df['Genre_3'] = df['Genre_3'].fillna('Genre Not Mentioned')
sorted(df['Genre_3'].unique())

['Absurdist fiction',
 'Adventure',
 'Adventure novel',
 'Anti-nuclear',
 'Apocalyptic and post-apocalyptic fiction',
 'Bildungsroman',
 'Biography',
 'Business',
 "Children's literature",
 'Comedy',
 'Comic fantasy',
 'Cozy',
 'Crime Fiction',
 'Detective fiction',
 'Dystopia',
 'Ergodic literature',
 'Erotica',
 'Fantasy',
 'Fiction',
 'Genre Not Mentioned',
 'High fantasy',
 'Historical fiction',
 'Historical novel',
 'Horror',
 'Humour',
 'Inspirational',
 'Künstlerroman',
 'Locked room mystery',
 'Magic realism',
 'Mathematics',
 'Memoir',
 'Military science fiction',
 'Mystery',
 'Nature',
 'Non-fiction',
 'Novel',
 'Novella',
 'Parallel novel',
 'Philosophy',
 'Picaresque novel',
 'Planetary romance',
 'Poetry',
 'Psychological novel',
 'Reference',
 'Religion',
 'Roman à clef',
 'Romance novel',
 'Satire',
 'Science',
 'Science Fiction',
 'Short story',
 'Social science fiction',
 'Social sciences',
 'Sociology',
 'Speculative fiction',
 'Spy fiction',
 'Steampunk',
 'Suspense',
 'Techno-thriller',
 'Travel literature',
 'Utopian and dystopian fiction',
 'War novel',
 'Western fiction',
 'Young adult literature',
 'Zombie']

sorted(df['language_code'].unique())

['en-CA', 'en-GB', 'en-US', 'eng', 'fre', 'ger', 'gla', 'mul', 'spa']

A bird’s-eye view of data!

df['Genre_1'].value_counts().nlargest(10) # Top 10 Genre based on count.

Science Fiction          368
Speculative fiction      332
Fiction                  284
Children's literature    214
Novel                    183
Thriller                 143
Crime Fiction            139
Fantasy                   99
Mystery                   76
Historical fiction        52
Name: Genre_1, dtype: int64

df.groupby('Genre_1')['average_rating'].mean().nlargest(10) # Top 10 Genre based on average rating.

Genre_1
True crime              4.430000
Adventure novel         4.428333
Religious text          4.340000
Self-help               4.300000
Science fantasy         4.240000
Conspiracy fiction      4.220000
Economics               4.180000
Prose poetry            4.145000
Historical fantasy      4.120000
Historical whodunnit    4.120000
Name: average_rating, dtype: float64

df['authors'].value_counts().nlargest(10) # Top 10 Authors based on count

Orson Scott Card       23
Agatha Christie        23
Stephen King           20
Dean Koontz            19
Mercedes Lackey        18
Laurell K. Hamilton    18
Terry Pratchett        18
Robert A. Heinlein     16
Anne Rice              15
Piers Anthony          15
Name: authors, dtype: int64

df['publisher'].value_counts().nlargest(10) # Top 10 Publishers based on count.

Penguin Books               86
Vintage                     86
Penguin Classics            57
Tor Books                   43
Modern Library              40
Ballantine Books            35
Grand Central Publishing    34
Berkley                     34
Bantam                      31
Pocket Books                29
Name: publisher, dtype: int64

df.groupby('title')['average_rating'].mean().nlargest(10) # Top 10 Books based on average rating.

title
Harry Potter and the Half-Blood Prince       4.570
Harry Potter and the Goblet of Fire          4.560
Harry Potter and the Prisoner of Azkaban     4.560
The Lord of the Rings                        4.500
The Return of the King                       4.494
Harry Potter and the Order of the Phoenix    4.490
Lonesome Dove                                4.490
Harry Potter and the Philosopher's Stone     4.470
A Breath of Snow and Ashes                   4.440
Found                                        4.440
Name: average_rating, dtype: float64

df.groupby('title')['ratings_count'].mean().nlargest(10) # Top 10 Books based on no. of ratings.

title
Twilight                                     4597666.0
The Catcher in the Rye                       2457092.0
Harry Potter and the Order of the Phoenix    2153167.0
Animal Farm                                  2111750.0
Of Mice and Men                              1755253.0
The Giver                                    1585589.0
Water for Elephants                          1260027.0
Harry Potter and the Prisoner of Azkaban     1171363.0
Harry Potter and the Chamber of Secrets      1150148.0
The Notebook                                 1090603.0
Name: ratings_count, dtype: float64

Exploratory Data Analysis

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set_style('darkgrid')

Numeric Features Distribution

# Pairwise scatter plots / histograms of the numeric columns.
# pairplot is a figure-level function that builds its own figure, so a
# preceding plt.figure() call only leaves an empty figure behind.
sns.pairplot(data=df)

<seaborn.axisgrid.PairGrid at 0x27dbf3af048>

<Figure size 864x432 with 0 Axes>

png

# Histogram of average rating (bar height = number of books in the bin).
fig, ax = plt.subplots(figsize=(12, 6))
df['average_rating'].hist(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x27dc0568080>

png

# Histogram of number of pages (bar height = number of books in the bin).
fig, ax = plt.subplots(figsize=(12, 6))
df['num_pages'].hist(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x27dc0136f28>

png

# Histogram of number of ratings (bar height = number of books in the bin).
fig, ax = plt.subplots(figsize=(12, 6))
df['ratings_count'].hist(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x27dc0f7a390>

png

# Histogram of number of text reviews (bar height = number of books in the bin).
fig, ax = plt.subplots(figsize=(12, 6))
df['text_reviews_count'].hist(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x27dc0f7a278>

png

# Number of books for every distinct Genre_1 value; tick labels are slanted
# so the long genre names do not overlap.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Genre_1', ax=ax)
plt.xticks(rotation=60, ha='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97]),
 <a list of 98 Text xticklabel objects>)

png

In the above graph we can see that there are a lot of sparse classes (genres with fewer than 10 books), so we’ll group them together as OTHER and then view this graph again.

# Genres that occur fewer than 10 times. value_counts() is computed once;
# its index holds the genre labels and its values hold the counts.
genre_1_counts = df['Genre_1'].value_counts()
sparse_classes = genre_1_counts[genre_1_counts < 10]

# New column to store values of Genre_1 with every sparse genre replaced by
# "OTHER". Membership is tested against the index (the genre labels) with a
# vectorised isin() rather than a per-row lambda.
is_sparse = df['Genre_1'].isin(sparse_classes.index)
df['Filtered_Genre_1'] = df['Genre_1'].mask(is_sparse, "OTHER")

plt.figure(figsize=(12, 6))
sns.countplot(x='Filtered_Genre_1', data=df)
plt.xticks(rotation=60, horizontalalignment='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24]),
 <a list of 25 Text xticklabel objects>)

png

Looks much better! So, the largest number of books have ‘Science Fiction’ as their Genre_1.

Let’s do the same for Genre_2 and Genre_3.

# Number of books for every distinct Genre_2 value, with slanted tick labels.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Genre_2', ax=ax)
plt.xticks(rotation=60, ha='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70]), <a list of 71 Text xticklabel objects>)

png

# Genres that occur fewer than 10 times. value_counts() is computed once;
# its index holds the genre labels and its values hold the counts.
genre_2_counts = df['Genre_2'].value_counts()
sparse_classes = genre_2_counts[genre_2_counts < 10]

# New column to store values of Genre_2 with every sparse genre replaced by
# "OTHER". Membership is tested against the index (the genre labels) with a
# vectorised isin() rather than a per-row lambda.
is_sparse = df['Genre_2'].isin(sparse_classes.index)
df['Filtered_Genre_2'] = df['Genre_2'].mask(is_sparse, "OTHER")

plt.figure(figsize=(12, 6))
sns.countplot(x='Filtered_Genre_2', data=df)
plt.xticks(rotation=60, horizontalalignment='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21]), <a list of 22 Text xticklabel objects>)

png

# Number of books for every distinct Genre_3 value, with slanted tick labels.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Genre_3', ax=ax)
plt.xticks(rotation=60, ha='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]),
 <a list of 65 Text xticklabel objects>)

png

# Genres that occur fewer than 10 times. value_counts() is computed once;
# its index holds the genre labels and its values hold the counts.
genre_3_counts = df['Genre_3'].value_counts()
sparse_classes = genre_3_counts[genre_3_counts < 10]

# New column to store values of Genre_3 with every sparse genre replaced by
# "OTHER". Membership is tested against the index (the genre labels) with a
# vectorised isin() rather than a per-row lambda.
is_sparse = df['Genre_3'].isin(sparse_classes.index)
df['Filtered_Genre_3'] = df['Genre_3'].mask(is_sparse, "OTHER")

plt.figure(figsize=(12, 6))
sns.countplot(x='Filtered_Genre_3', data=df)
plt.xticks(rotation=60, horizontalalignment='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17]), <a list of 18 Text xticklabel objects>)

png

# Number of books per language code.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='language_code', ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x27dc267d198>

png

Now, let’s create a new column ‘Group_genre’ with combined values from Genre_1, Genre_2 and Genre_3.

# Join the three filtered genre columns into one ' / '-separated label per book.
df['Group_genre'] = df['Filtered_Genre_1'].str.cat(
    [df['Filtered_Genre_2'], df['Filtered_Genre_3']],
    sep=' / ',
)

# Number of books for every combined genre label.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Group_genre', ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x27dc2a35d30>

png

# Combined genre labels that occur fewer than 10 times. value_counts() is
# computed once; its index holds the labels and its values hold the counts.
group_genre_counts = df['Group_genre'].value_counts()
sparse_classes = group_genre_counts[group_genre_counts < 10]

# New column to store values of Group_genre with every sparse label replaced
# by "OTHER". Membership is tested against the index (the labels) with a
# vectorised isin() rather than a per-row lambda.
is_sparse = df['Group_genre'].isin(sparse_classes.index)
df['Filtered_Group_genre'] = df['Group_genre'].mask(is_sparse, "OTHER")

plt.figure(figsize=(12, 6))
sns.countplot(x='Filtered_Group_genre', data=df)
plt.xticks(rotation=60, horizontalalignment='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
 <a list of 50 Text xticklabel objects>)

png

It wasn’t very useful I guess, but I am just learning! :)

Let’s analyse the date field.

# Date field converted from str to datetime.
# NOTE(review): the preview rows look like M/D/YYYY — consider passing an
# explicit format= to to_datetime once that is confirmed for every row.
df['publication_date'] = pd.to_datetime(df['publication_date'])

# Year and month derived from the date field with the vectorised .dt accessor
# instead of a per-row lambda.
df['year'] = df['publication_date'].dt.year
df['month'] = df['publication_date'].dt.month

# Plot of number of books per year.
plt.figure(figsize=(15, 6))
sns.countplot(x='year', data=df)
plt.xticks(rotation=60, horizontalalignment='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58]),
 <a list of 59 Text xticklabel objects>)

png

# Per-year count of non-null entries in every column (one row per publication year).
byyear = df.groupby('year').count()
byyear.head()

	bookID	title	title_desc	Genre_1	Genre_2	Genre_3	authors	average_rating	isbn	isbn13	...	ratings_count	text_reviews_count	publication_date	publisher	Filtered_Genre_1	Filtered_Genre_2	Filtered_Genre_3	Group_genre	Filtered_Group_genre	month
year
1925	1	1	0	1	1	1	1	1	1	1	...	1	1	1	1	1	1	1	1	1	1
1940	1	1	0	1	1	1	1	1	1	1	...	1	1	1	1	1	1	1	1	1	1
1952	1	1	1	1	1	1	1	1	1	1	...	1	1	1	1	1	1	1	1	1	1
1953	1	1	0	1	1	1	1	1	1	1	...	1	1	1	1	1	1	1	1	1	1
1954	1	1	0	1	1	1	1	1	1	1	...	1	1	1	1	1	1	1	1	1	1

5 rows × 22 columns

plt.figure(figsize = (15,6))
byyear['bookID'].plot()
# Line plot of the number of books per year (bookID count for each year).

<matplotlib.axes._subplots.AxesSubplot at 0x27dc37629b0>

png

# Month-wise number of books of year 2005. Similarly we can see for other years.
# The subset gets its own name so the yearly counts stored in `byyear` are not
# overwritten (re-running the yearly plot cell would otherwise break).
plt.figure(figsize=(12, 6))
books_2005 = df[df['year'] == 2005]
sns.countplot(x='month', data=books_2005)

<matplotlib.axes._subplots.AxesSubplot at 0x27dc3f86e48>

png

# Books per (month, year) pair, reshaped so months are rows and years are
# columns; combinations with no books become 0.
monthly_counts = df.groupby(['month', 'year'])['bookID'].count()
monthyear = monthly_counts.unstack().fillna(0)
monthyear.head()

year	1925	1940	1952	1953	1954	1957	1959	1961	1962	1963	...	2007	2008	2009	2010	2012	2014	2015	2016	2017	2019
month
1	0.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	30.0	3.0	2.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	2.0	...	12.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	18.0	1.0	1.0	1.0	1.0	0.0	0.0	1.0	1.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	9.0	0.0	1.0	2.0	0.0	0.0	0.0	0.0	1.0	1.0
5	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	11.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0

5 rows × 59 columns

# Heatmap of the number of books for each month (rows) and year (columns).
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(monthyear, cmap='coolwarm', ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x27dc3da43c8>

png

# Box plot of average rating for each filtered Genre_1 value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Filtered_Genre_1', y='average_rating', ax=ax)
plt.xticks(rotation=60, ha='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24]),
 <a list of 25 Text xticklabel objects>)

png

# Box plot of number of pages for each filtered Genre_1 value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Filtered_Genre_1', y='num_pages', ax=ax)
plt.xticks(rotation=60, ha='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24]),
 <a list of 25 Text xticklabel objects>)

png

# Swarm plot of the number of text reviews for each filtered Genre_1 value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.swarmplot(data=df, x='Filtered_Genre_1', y='text_reviews_count', ax=ax)
plt.xticks(rotation=60, ha='right')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24]),
 <a list of 25 Text xticklabel objects>)

png

# This heatmap shows the pairwise correlation between numeric features.
# Non-numeric columns are dropped first: DataFrame.corr() raises on string
# columns in pandas >= 2.0 instead of silently skipping them.
plt.figure(figsize=(12, 6))
sns.heatmap(df.select_dtypes(include='number').corr(), cmap='viridis')

<matplotlib.axes._subplots.AxesSubplot at 0x27dc572f160>

png

This completes Exploratory Data Analysis. :)

Share on

Twitter Facebook Google+ LinkedIn

Vaishali Tanwar

Exploratory Data Analysis Project

Data Cleaning

A bird’s-eye view of data!

Exploratory Data Analysis

Numeric Features Distribution

Share on

You May Also Enjoy

Exploratory Data Analysis Project