Anand P V AdroitAnandAI

## pre.ipynb

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                AdroitAnandAI
                / pre.ipynb
            
            
              Last active
              December 14, 2018 00:37
            
              
                crime rate 
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## load.py
# Load the crime dataset (crime_v2.csv) into a pandas DataFrame.
crimeData = pd.read_csv("crime_v2.csv")
# Preview the first five rows of the input data.
crimeData.head(5)

## analysis
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
# to check the characteristics of the data.
crimeData.info()

## analysis2.py
# Descriptive statistics for the features.
crimeData.describe()
# Anomaly check: select the rows whose 'prbarr' value exceeds 1.
# NOTE: this only lists the offending rows for inspection; nothing is dropped here.
crimeData.loc[crimeData['prbarr'].gt(1)]
# Consistency check: select the rows where 'west' + 'central' is greater than 1,
# i.e. a location marked as both west and central, which cannot be valid.
crimeData.loc[(crimeData['west'] + crimeData['central']).gt(1)]

## analysis1.py
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
# to check the characteristics of the data.
crimeData.info()

## analysis3.py
# Count, column by column, how many entries are equal to zero.
crimeData.eq(0).sum()

## univ.py
# Distribution of the dependent variable, crime rate ('crmrte'):
# a 100-bin histogram drawn in green with semi-transparent (alpha=0.4) bars.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot/displot — confirm against the installed seaborn version.
sns.distplot(crimeData['crmrte'], color='g', bins=100, hist_kws={'alpha': 0.4})

## univ2.py
# Histogram of each column (40 bins) on a single 16x20 figure with small
# (size 8) axis tick labels. The trailing semicolon is presumably there to
# suppress the notebook's echo of the returned axes array.
crimeData.hist(figsize=(16, 20), bins=40, xlabelsize=8, ylabelsize=8);

## univ3.py
# Plot the binned PDF and CDF of every feature, one subplot per feature,
# on a single large figure.
plt.figure(figsize=(25, 25))
for idx, col in enumerate(crimeData.columns[2:]):  # skip the first two columns (county & year)
    # 10-bin histogram of the column's values.
    counts, bin_edges = np.histogram(crimeData[col], bins=10, density=True)
    # Normalise the bin heights so they sum to 1, then accumulate for the CDF.
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)

    # 7x4 grid: room for up to 28 features; subplot indices are 1-based.
    plt.subplot(7, 4, idx + 1)
    # Both curves are plotted against the right-hand edge of each bin.
    plt.plot(bin_edges[1:], pdf, label='PDF')
    plt.plot(bin_edges[1:], cdf, label='CDF')
    plt.title(col + " CDF", fontsize=15)
    # Without a legend the 'PDF'/'CDF' labels above are never displayed.
    plt.legend()

## biv.py
# To plot the correlation of all features against crime rate
for i in range(0, len(crimeData.columns), 5):
    sns.pairplot(data=crimeData,
                x_vars=crimeData.columns[i:i+5],
                y_vars=['crmrte'])
	# Load the crime dataset (crime_v2.csv) into a pandas DataFrame.
	crimeData = pd.read_csv("crime_v2.csv")
	# Preview the first five rows of the input data.
	crimeData.head(5)
	# Descriptive statistics for the features.
	crimeData.describe()
	# Anomaly check: select the rows whose 'prbarr' value exceeds 1.
	# NOTE: this only lists the offending rows for inspection; nothing is dropped here.
	crimeData.loc[crimeData['prbarr'].gt(1)]
	# Consistency check: select the rows where 'west' + 'central' is greater than 1,
	# i.e. a location marked as both west and central, which cannot be valid.
	crimeData.loc[(crimeData['west'] + crimeData['central']).gt(1)]
	# Count, column by column, how many entries are equal to zero.
	crimeData.eq(0).sum()
	# Distribution of the dependent variable, crime rate ('crmrte'):
	# a 100-bin histogram drawn in green with semi-transparent (alpha=0.4) bars.
	# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
	# of histplot/displot — confirm against the installed seaborn version.
	sns.distplot(crimeData['crmrte'], color='g', bins=100, hist_kws={'alpha': 0.4})
	# Plot the binned PDF and CDF of every feature, one subplot per feature,
	# on a single large figure.
	plt.figure(figsize=(25, 25))
	for idx, col in enumerate(crimeData.columns[2:]):  # skip the first two columns (county & year)
		# 10-bin histogram of the column's values.
		counts, bin_edges = np.histogram(crimeData[col], bins=10, density=True)
		# Normalise the bin heights so they sum to 1, then accumulate for the CDF.
		pdf = counts / counts.sum()
		cdf = np.cumsum(pdf)

		# 7x4 grid: room for up to 28 features; subplot indices are 1-based.
		plt.subplot(7, 4, idx + 1)
		# Both curves are plotted against the right-hand edge of each bin.
		plt.plot(bin_edges[1:], pdf, label='PDF')
		plt.plot(bin_edges[1:], cdf, label='CDF')
		plt.title(col + " CDF", fontsize=15)
		# Without a legend the 'PDF'/'CDF' labels above are never displayed.
		plt.legend()
	# Scatter the columns against the crime rate ('crmrte'), taking the columns
	# five at a time so that each pairplot shows one row of at most five panels.
	for i in range(0, len(crimeData.columns), 5):
		# The call must sit inside the loop body so it runs once per chunk.
		sns.pairplot(
			data=crimeData,
			x_vars=crimeData.columns[i:i + 5],
			y_vars=['crmrte'],
		)