- `pandas`: Data manipulation and analysis.
- `numpy`: Numerical computations.
- `matplotlib` & `seaborn`: Data visualization.
- `scipy` & `statsmodels`: Statistical analysis.
- `scikit-learn`: Machine learning for analysis.

```python
import pandas as pd

data = pd.read_csv("data.csv")
print(data.head())  # Display first 5 rows
```
- `data.shape`: dimensions of the dataset as (rows, columns).
- `data.dtypes`: data type of each column.
- `data.describe()`: summary statistics.
```python
print(data.info())  # Overview of columns
print(data['Age'].mean())  # Calculate mean of Age column
```
```python
data.dropna(inplace=True)  # Remove rows with missing data
data.fillna(0, inplace=True)  # Replace missing values with 0
```
```python
data.rename(columns={"OldName": "NewName"}, inplace=True)
data.drop("UnwantedColumn", axis=1, inplace=True)
```
```python
filtered_data = data[data['Age'] > 30]
sorted_data = data.sort_values(by='Salary', ascending=False)
```
```python
grouped_data = data.groupby('Department')['Salary'].mean()
print(grouped_data)
```
```python
import matplotlib.pyplot as plt
import seaborn as sns

data['Sales'].plot(kind='line')
sns.scatterplot(x='Age', y='Salary', data=data)
data['Salary'].plot(kind='hist', bins=10)
plt.show()
```
```python
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
sns.boxplot(x='Department', y='Salary', data=data)
```
- `data['column_name'].mean()`
- `data['column_name'].median()`
- `data['column_name'].std()`
- `data.corr()`
Linear regression (using `scipy`):
```python
from scipy.stats import linregress

slope, intercept, r_value, p_value, std_err = linregress(data['X'], data['Y'])
print(f"Slope: {slope}, R-squared: {r_value**2}")
```
Use `groupby` to analyze sales trends by region or product.
```python
sales_by_region = data.groupby('Region')['Sales'].sum()
print(sales_by_region)
```
```python
Q1 = data['Salary'].quantile(0.25)
Q3 = data['Salary'].quantile(0.75)
IQR = Q3 - Q1
outliers = data[(data['Salary'] < Q1 - 1.5 * IQR) | (data['Salary'] > Q3 + 1.5 * IQR)]
```
Cluster the data using `scikit-learn`.
```python
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
data['Cluster'] = kmeans.fit_predict(data[['Spending', 'Visits']])
```
Analyze time series using `statsmodels` or `pandas`.
```python
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)
data['Sales'].plot()
```