VegZ is a comprehensive Python package for vegetation data analysis and environmental modeling. This manual provides complete, working examples using the correct API syntax.
pip install VegZ
For development version:
pip install git+https://github.com/mhatim99/VegZ.git
import pandas as pd
import numpy as np
from VegZ import quick_diversity_analysis, quick_ordination, quick_clustering, quick_elbow_analysis
# Create sample data
n_sites, n_species = 50, 20
data = pd.DataFrame(
np.random.exponential(2, (n_sites, n_species)),
columns=[f'Species_{i+1}' for i in range(n_species)]
)
# Quick diversity analysis
diversity_results = quick_diversity_analysis(data)
print("Quick diversity analysis completed")
print(f"Shape: {diversity_results.shape}")
# Quick ordination
ordination_results = quick_ordination(data, method='pca')
print("Quick PCA completed")
print(f"Available keys: {list(ordination_results.keys())}")
# Quick clustering
clustering_results = quick_clustering(data, n_clusters=3, method='kmeans')
print("Quick clustering completed")
# Quick elbow analysis
elbow_results = quick_elbow_analysis(data, max_k=10, plot_results=False)
print("Quick elbow analysis completed")
from VegZ import VegZ
import pandas as pd
import numpy as np
# Initialize VegZ
veg = VegZ()
# Create sample data
n_sites, n_species = 50, 20
data = pd.DataFrame(
np.random.exponential(2, (n_sites, n_species)),
columns=[f'Species_{i+1}' for i in range(n_species)]
)
# Assign data to VegZ instance
veg.data = data
veg.species_matrix = data
# Basic diversity calculation
diversity = veg.calculate_diversity(['shannon', 'simpson', 'richness'])
print(f"Diversity calculated for {diversity.shape[0]} sites")
# PCA analysis
pca_results = veg.pca_analysis(transform='hellinger')
print(f"PCA explained variance: {pca_results['explained_variance_ratio'][:2]}")
# NMDS analysis
nmds_results = veg.nmds_analysis(distance_metric='bray_curtis', n_dimensions=2)
print(f"NMDS stress: {nmds_results['stress']:.3f}")
# K-means clustering
kmeans_results = veg.kmeans_clustering(n_clusters=3, transform='hellinger')
print(f"K-means inertia: {kmeans_results['inertia']:.3f}")
# Hierarchical clustering
hier_results = veg.hierarchical_clustering(distance_metric='bray_curtis', linkage_method='average')
print("Hierarchical clustering completed")
# Rarefaction analysis
rarefaction = veg.rarefaction_curve()
print(f"Rarefaction curve shape: {rarefaction.shape}")
# Summary statistics
summary = veg.summary_statistics()
print(f"Data summary: {summary['n_sites']} sites, {summary['n_species']} species")
import matplotlib.pyplot as plt
# Plot diversity
fig = veg.plot_diversity(diversity, index_name='shannon')
plt.title('Shannon Diversity')
plt.show()
# Plot ordination
fig = veg.plot_ordination(pca_results)
plt.title('PCA Biplot')
plt.show()
# Plot species accumulation curve
fig = veg.plot_species_accumulation(rarefaction)
plt.title('Species Accumulation Curve')
plt.show()
# Plot cluster dendrogram
fig = veg.plot_cluster_dendrogram(hier_results)
plt.title('Cluster Dendrogram')
plt.show()
from VegZ import DiversityAnalyzer
diversity = DiversityAnalyzer()
# Calculate all diversity indices at once
all_indices = diversity.calculate_all_indices(data)
print("All diversity indices:")
print(f"Available indices: {list(all_indices.columns)}")
print(all_indices.head())
# Calculate individual indices
shannon = diversity.calculate_index(data, 'shannon')
simpson = diversity.calculate_index(data, 'simpson')
richness = diversity.calculate_index(data, 'richness')
evenness = diversity.calculate_index(data, 'evenness')
print(f"Shannon diversity range: {shannon.min():.3f} - {shannon.max():.3f}")
print(f"Simpson diversity range: {simpson.min():.3f} - {simpson.max():.3f}")
# Advanced diversity indices
fisher_alpha = diversity.calculate_index(data, 'fisher_alpha')
berger_parker = diversity.calculate_index(data, 'berger_parker')
print(f"Fisher's alpha range: {fisher_alpha.min():.3f} - {fisher_alpha.max():.3f}")
print(f"Berger-Parker range: {berger_parker.min():.3f} - {berger_parker.max():.3f}")
# Calculate Hill numbers (correct parameter name: q_values)
hill_numbers = diversity.hill_numbers(data, q_values=[0, 0.5, 1, 1.5, 2])
print("Hill numbers:")
print(f"Shape: {hill_numbers.shape}")
print(hill_numbers.head())
# Interpretation:
# q=0: Species richness (Hill 0)
# q=1: Shannon diversity exponential (Hill 1)
# q=2: Simpson diversity inverse (Hill 2)
# Whittaker's beta diversity (returns single value)
beta_whittaker = diversity.beta_diversity(data, method='whittaker')
print(f"Whittaker's beta diversity: {beta_whittaker:.3f}")
# Sørensen dissimilarity (returns distance matrix)
beta_sorensen = diversity.beta_diversity(data, method='sorensen')
print(f"Sørensen dissimilarity matrix shape: {beta_sorensen.shape}")
# Jaccard dissimilarity (returns distance matrix)
beta_jaccard = diversity.beta_diversity(data, method='jaccard')
print(f"Jaccard dissimilarity matrix shape: {beta_jaccard.shape}")
# Individual richness estimators
chao1 = diversity.chao1_estimator(data)
ace = diversity.ace_estimator(data, rare_threshold=10)
jack1 = diversity.jackknife1_estimator(data)
jack2 = diversity.jackknife2_estimator(data)
print("Richness estimators:")
print(f"Chao1 range: {chao1.min():.1f} - {chao1.max():.1f}")
print(f"ACE range: {ace.min():.1f} - {ace.max():.1f}")
print(f"Jackknife1 range: {jack1.min():.1f} - {jack1.max():.1f}")
print(f"Jackknife2 range: {jack2.min():.1f} - {jack2.max():.1f}")
from VegZ import MultivariateAnalyzer
import numpy as np
multivar = MultivariateAnalyzer()
# PCA Analysis
pca_results = multivar.pca_analysis(data, transform='hellinger')
print("PCA Analysis:")
print(f"Explained variance ratio: {pca_results['explained_variance_ratio'][:3]}")
cumulative_variance = np.cumsum(pca_results['explained_variance_ratio'])
print(f"Cumulative variance: {cumulative_variance[:3]}")
# NMDS Analysis
nmds_results = multivar.nmds_analysis(
data,
distance_metric='bray_curtis',
n_dimensions=2,
max_iterations=300
)
print(f"NMDS stress: {nmds_results['stress']:.4f}")
print(f"Converged: {nmds_results['converged']}")
# Correspondence Analysis (CA)
ca_results = multivar.correspondence_analysis(data, scaling=1)
print("CA Analysis completed")
print(f"Available keys: {list(ca_results.keys())}")
# Detrended Correspondence Analysis (DCA)
dca_results = multivar.detrended_correspondence_analysis(data, segments=26)
print("DCA Analysis completed")
print(f"Gradient lengths: {dca_results['gradient_lengths']}")
# Create environmental data
env_data = pd.DataFrame({
'Temperature': np.random.normal(15, 5, n_sites),
'Precipitation': np.random.exponential(100, n_sites),
'pH': np.random.uniform(4.5, 8.0, n_sites),
'Elevation': np.random.uniform(100, 2000, n_sites)
})
# Canonical Correspondence Analysis (CCA)
cca_results = multivar.canonical_correspondence_analysis(
species_data=data,
env_data=env_data,
scaling=1
)
print("CCA Analysis completed")
print(f"Eigenvalues: {cca_results['eigenvalues'][:3]}")
print(f"Species-environment correlations: {cca_results['species_env_correlation'][:3]}")
# Alternative abbreviated method name (same function)
cca_results2 = multivar.cca_analysis(species_data=data, env_data=env_data)
print("CCA using abbreviated method completed")
# Environmental vector fitting
env_fit = multivar.environmental_fitting(
ordination_scores=pca_results['site_scores'].iloc[:, :2], # First 2 PC axes
env_data=env_data,
method='vector'
)
print("Environmental vector fitting:")
print("Significant vectors (p < 0.05):")
for env_var, p_value in env_fit['p_values'].items():
if p_value < 0.05:
r2_value = env_fit['r_squared'][env_var]
print(f" {env_var}: R² = {r2_value:.3f}, p = {p_value:.3f}")
# Test ordination quality
gof_results = multivar.goodness_of_fit_test(
ordination_results=pca_results,
original_data=data,
distance_metric='bray_curtis'
)
print("Goodness of fit:")
print(f"Correlation: {gof_results['correlation']:.3f}")
print(f"Stress: {gof_results['stress']:.3f}")
from VegZ import VegetationClustering
clustering = VegetationClustering()
# K-means clustering
kmeans_results = clustering.kmeans_clustering(data, n_clusters=4, n_init=10)
print("K-means clustering:")
print(f"Silhouette score: {kmeans_results['silhouette_score']:.3f}")
print(f"Calinski-Harabasz score: {kmeans_results['calinski_harabasz_score']:.1f}")
# Hierarchical clustering
hierarchical_results = clustering.hierarchical_clustering(
data,
method='ward',
metric='euclidean',
n_clusters=4
)
print(f"Hierarchical clustering completed with {hierarchical_results['n_clusters']} clusters")
# Fuzzy C-means clustering (correct parameter name: fuzziness)
fuzzy_results = clustering.fuzzy_cmeans_clustering(
data,
n_clusters=4,
fuzziness=2.0, # Note: correct parameter name
max_iter=100
)
print("Fuzzy C-means clustering completed")
print(f"Fuzzy partition coefficient: {fuzzy_results['partition_coefficient']:.3f}")
# DBSCAN clustering
dbscan_results = clustering.dbscan_clustering(
data,
eps=0.5,
min_samples=5,
distance_metric='euclidean'
)
print(f"DBSCAN found {dbscan_results['n_clusters']} clusters")
print(f"Number of noise points: {dbscan_results['n_noise']}")
# Gaussian Mixture clustering
gmm_results = clustering.gaussian_mixture_clustering(
data,
n_components=4,
covariance_type='full'
)
print(f"GMM AIC: {gmm_results['aic']:.1f}")
print(f"GMM BIC: {gmm_results['bic']:.1f}")
# Comprehensive elbow analysis
elbow_results = clustering.comprehensive_elbow_analysis(
data,
k_range=range(2, 11),
methods=['knee_locator', 'derivative', 'variance_explained'],
transform='hellinger',
plot_results=False
)
print("Elbow analysis results:")
print(f"Recommended k values: {elbow_results['recommendations']}")
# Optimal clusters using multiple criteria
optimal_results = clustering.optimal_clusters_analysis(
data,
k_range=range(2, 11),
methods=['silhouette', 'gap_statistic']
)
print("Optimal cluster analysis:")
print(f"Best k by silhouette: {optimal_results['silhouette']['best_k']}")
print(f"Best k by gap statistic: {optimal_results['gap_statistic']['best_k']}")
from VegZ import EcologicalStatistics
from scipy.spatial.distance import pdist, squareform
stats = EcologicalStatistics()
# Create distance matrix for tests that require it
distances = pdist(data, metric='braycurtis')
distance_matrix = squareform(distances)
# Create grouping variable
groups = kmeans_results['cluster_labels']
# PERMANOVA (requires distance matrix)
permanova = stats.permanova(
distance_matrix=distance_matrix,
groups=groups,
permutations=999
)
print("PERMANOVA results:")
print(f"F-statistic: {permanova['f_statistic']:.3f}")
print(f"p-value: {permanova['p_value']:.3f}")
print(f"R-squared: {permanova['r_squared']:.3f}")
# ANOSIM
anosim = stats.anosim(
distance_matrix=distance_matrix,
groups=groups,
permutations=999
)
print("ANOSIM results:")
print(f"R statistic: {anosim['r_statistic']:.3f}")
print(f"p-value: {anosim['p_value']:.3f}")
# MRPP (requires distance matrix)
mrpp = stats.mrpp(
distance_matrix=distance_matrix,
groups=groups,
permutations=999
)
print("MRPP results:")
print(f"A statistic: {mrpp['a_statistic']:.3f}")
print(f"p-value: {mrpp['p_value']:.3f}")
# Create second distance matrix for Mantel test
env_distances = pdist(env_data, metric='euclidean')
env_distance_matrix = squareform(env_distances)
# Mantel test
mantel = stats.mantel_test(
matrix1=distance_matrix,
matrix2=env_distance_matrix,
permutations=999,
method='pearson'
)
print("Mantel test results:")
print(f"Correlation: {mantel['correlation']:.3f}")
print(f"p-value: {mantel['p_value']:.3f}")
# Partial Mantel test (requires 3 matrices)
spatial_coords = env_data[['Temperature', 'pH']].values # Use as spatial proxy
spatial_distances = pdist(spatial_coords, metric='euclidean')
spatial_distance_matrix = squareform(spatial_distances)
partial_mantel = stats.partial_mantel_test(
matrix1=distance_matrix,
matrix2=env_distance_matrix,
matrix3=spatial_distance_matrix,
permutations=999
)
print("Partial Mantel test results:")
print(f"Partial correlation: {partial_mantel['partial_correlation']:.3f}")
print(f"p-value: {partial_mantel['p_value']:.3f}")
# Indicator species analysis
indicator = stats.indicator_species_analysis(
species_data=data,
groups=groups,
permutations=999
)
print("Indicator Species Analysis:")
print("Top indicator species:")
top_indicators = indicator['species_stats'].nlargest(5, 'indicator_value')
print(top_indicators[['indicator_value', 'p_value']])
# SIMPER analysis
simper = stats.simper_analysis(
species_data=data,
groups=groups,
distance_metric='bray_curtis'
)
print("SIMPER analysis completed")
print(f"Average dissimilarity: {simper['average_dissimilarity']:.3f}")
from VegZ import TemporalAnalyzer
temporal = TemporalAnalyzer()
# Create temporal data
temporal_data = pd.DataFrame({
'date': pd.date_range('2015-01-01', periods=100, freq='W'),
'abundance': np.random.exponential(2, 100) + np.sin(np.arange(100) * 0.1) * 5,
'species': 'Species_1'
})
# Trend detection (correct parameter names)
trend_results = temporal.trend_detection(
data=temporal_data,
time_col='date',
response_col='abundance',
method='mann_kendall'
)
print("Trend Detection:")
print(f"Trend: {trend_results['trend']}")
print(f"p-value: {trend_results['p_value']:.4f}")
print(f"Sen's slope: {trend_results['sens_slope']:.3f}")
# Phenology modeling (correct parameter names)
phenology_results = temporal.phenology_modeling(
data=temporal_data,
time_col='date',
response_col='abundance',
model_type='sigmoid'
)
print("Phenology modeling:")
print(f"Model type: {phenology_results['model_type']}")
model_fit = phenology_results['results']['combined']
print(f"Model success: {model_fit['success']}")
print(f"R-squared: {model_fit['r_squared']:.3f}")
# Seasonal decomposition
seasonal_results = temporal.seasonal_decomposition(
data=temporal_data,
time_col='date',
response_col='abundance',
method='classical',
period=52 # Weekly data, annual period
)
print("Seasonal decomposition:")
print(f"Method: {seasonal_results['method']}")
print(f"Period: {seasonal_results['period']}")
print(f"Trend component shape: {seasonal_results['trend'].shape}")
# Growth curve fitting
growth_data = pd.DataFrame({
'time': np.arange(50),
'size': np.random.exponential(1, 50) * np.arange(1, 51) * 0.5,
'species': 'Species_A'
})
growth_results = temporal.growth_curve_fitting(
data=growth_data,
time_col='time',
size_col='size',
curve_type='logistic'
)
print("Growth curve fitting:")
growth_fit = growth_results['results']['combined']
print(f"R-squared: {growth_fit['r_squared']:.3f}")
print(f"Growth parameters: {growth_fit['growth_parameters']}")
from VegZ import SpatialAnalyzer
spatial = SpatialAnalyzer()
# Create spatial data (correct format)
spatial_data = pd.DataFrame({
'longitude': np.random.uniform(-120, -100, 50),
'latitude': np.random.uniform(30, 45, 50),
'response': np.random.exponential(2, 50)
})
# Spatial interpolation (correct parameter names)
idw_results = spatial.spatial_interpolation(
data=spatial_data,
x_col='longitude',
y_col='latitude',
z_col='response',
method='idw',
grid_resolution=0.1
)
print("IDW interpolation completed")
print(f"Grid shape: {idw_results['Z_grid'].shape}")
# Kriging interpolation
kriging_results = spatial.spatial_interpolation(
data=spatial_data,
x_col='longitude',
y_col='latitude',
z_col='response',
method='kriging',
grid_resolution=0.1
)
print("Kriging interpolation completed")
print(f"Available results: {list(kriging_results.keys())}")
# Spatial autocorrelation (correct parameter names)
morans_i = spatial.spatial_autocorrelation(
data=spatial_data,
x_col='longitude',
y_col='latitude',
response_col='response',
method='morans_i'
)
print("Spatial autocorrelation:")
print(f"Moran's I: {morans_i['morans_i']:.4f}")
print(f"p-value: {morans_i['p_value']:.4f}")
print(f"Expected I: {morans_i['expected_i']:.4f}")
# Habitat suitability modeling
presence_data = pd.DataFrame({
'longitude': np.random.uniform(-120, -100, 100),
'latitude': np.random.uniform(30, 45, 100),
'presence': np.random.choice([0, 1], 100, p=[0.7, 0.3]),
'temperature': np.random.normal(15, 5, 100),
'precipitation': np.random.exponential(100, 100)
})
habitat_results = spatial.habitat_suitability_modeling(
presence_data=presence_data,
environmental_data=presence_data, # Same dataframe with env variables
x_col='longitude',
y_col='latitude',
response_col='presence',
method='random_forest'
)
print("Habitat suitability modeling completed")
print(f"Model performance: {habitat_results['model_performance']}")
from VegZ import EnvironmentalModeler
env_model = EnvironmentalModeler()
# Species response curves
species_response = env_model.species_response_curves(
species_data=data.iloc[:, 0], # First species
environmental_var=env_data['Temperature'],
curve_type='gaussian'
)
print("Species response curves:")
print(f"Optimum temperature: {species_response['optimum']:.2f}")
print(f"Tolerance: {species_response['tolerance']:.2f}")
# GAM fitting
gam_results = env_model.fit_gam(
data=pd.concat([data.iloc[:, :5], env_data], axis=1),
response_col='Species_1',
predictor_cols=['Temperature', 'pH', 'Precipitation'],
family='gaussian'
)
print("GAM results:")
print(f"R-squared: {gam_results['r_squared']:.3f}")
print(f"AIC: {gam_results['aic']:.1f}")
# Environmental gradient analysis
gradient_results = env_model.environmental_gradient_analysis(
species_data=data,
env_data=env_data,
method='cca'
)
print("Environmental gradient analysis completed")
print(f"Constrained variance: {gradient_results['constrained_variance']:.3f}")
from VegZ import FunctionalTraitAnalyzer
traits_analyzer = FunctionalTraitAnalyzer()
# Create trait data
trait_data = pd.DataFrame({
'SLA': np.random.normal(20, 5, n_species),
'Height': np.random.lognormal(1, 0.5, n_species),
'SeedMass': np.random.lognormal(0, 1, n_species)
}, index=[f'Species_{i+1}' for i in range(n_species)])
# Load trait data into analyzer (required step)
traits_analyzer.load_trait_data(trait_data, abundance_data=data)
# Calculate functional diversity
func_diversity = traits_analyzer.calculate_functional_diversity(
traits=['SLA', 'Height', 'SeedMass'],
standardize=True
)
print("Functional diversity:")
print(f"Available indices: {func_diversity['site_diversity'].columns.tolist()}")
print(f"Mean functional richness: {func_diversity['site_diversity']['FRic'].mean():.3f}")
# Calculate functional beta diversity
func_beta = traits_analyzer.calculate_functional_beta_diversity(
traits=['SLA', 'Height', 'SeedMass']
)
print("Functional beta diversity:")
print(f"Gamma diversity: {func_beta['gamma_diversity']:.3f}")
print(f"Mean alpha diversity: {func_beta['mean_alpha_diversity']:.3f}")
print(f"Beta diversity: {func_beta['beta_diversity']:.3f}")
# Identify functional groups
func_groups = traits_analyzer.identify_functional_groups(
n_groups=4,
traits=['SLA', 'Height', 'SeedMass'],
method='hierarchical'
)
print("Functional groups:")
print(f"Number of groups: {func_groups['n_groups']}")
print("Group characteristics available")
# Trait-environment relationships
trait_env = traits_analyzer.trait_environment_relationships(
environmental_data=env_data
)
print("Trait-environment relationships:")
print(f"Correlations shape: {trait_env['correlations'].shape}")
print(f"Significant correlations: {len(trait_env['significant_correlations'])}")
from VegZ import MachineLearningAnalyzer
ml = MachineLearningAnalyzer()
# Prepare ML data
ml_data = pd.concat([data.iloc[:, :5], env_data], axis=1)
ml_data['biomass'] = np.random.exponential(50, n_sites)
# Biomass prediction
biomass_results = ml.biomass_prediction(
data=ml_data,
biomass_column='biomass',
predictor_features=['Species_1', 'Species_2', 'Temperature', 'pH'],
model_type='rf',
optimize_hyperparameters=False
)
print("Biomass Prediction Results:")
print(f"Model performance: {biomass_results['performance']}")
print("Feature importance:")
for feature, importance in zip(biomass_results['feature_names'], biomass_results['feature_importance']):
print(f" {feature}: {importance:.3f}")
# Community classification
community_results = ml.community_classification(
data=ml_data,
species_columns=[f'Species_{i+1}' for i in range(5)],
n_communities=3,
method='kmeans'
)
print("Community Classification:")
print(f"Number of communities: {community_results['n_communities']}")
print(f"Cluster centers shape: {community_results['cluster_centers'].shape}")
# Species identification
ml_data['leaf_length'] = np.random.normal(5, 1, n_sites)
ml_data['leaf_width'] = np.random.normal(2, 0.5, n_sites)
species_labels = np.random.choice(['Species_A', 'Species_B', 'Species_C'], n_sites)
ml_data['species_label'] = species_labels
identification_results = ml.species_identification(
data=ml_data,
morphological_features=['leaf_length', 'leaf_width'],
species_column='species_label',
test_size=0.3
)
print("Species Identification:")
print(f"Best model: {identification_results['best_model']}")
print("Model performance:")
for model, performance in identification_results['performance'].items():
print(f" {model}: accuracy = {performance.get('accuracy', 'N/A')}")
# Habitat suitability modeling
ml_data['presence'] = np.random.choice([0, 1], n_sites, p=[0.6, 0.4])
habitat_results = ml.habitat_suitability_modeling(
data=ml_data,
species_column='presence',
environmental_features=['Temperature', 'pH', 'Precipitation'],
model_type='rf',
cross_validation=True
)
print("Habitat Suitability Modeling:")
print(f"Cross-validation score: {habitat_results['cv_scores'].mean():.3f}")
# Dimensionality reduction
dim_reduction = ml.dimensionality_reduction(
data=ml_data,
feature_columns=[f'Species_{i+1}' for i in range(5)],
method='pca',
n_components=2
)
print("Dimensionality Reduction:")
print(f"Explained variance ratio: {dim_reduction['explained_variance_ratio']}")
# Anomaly detection
anomaly_results = ml.ecological_anomaly_detection(
data=ml_data,
feature_columns=[f'Species_{i+1}' for i in range(5)],
contamination=0.1,
method='isolation_forest'
)
print("Anomaly Detection:")
print(f"Number of anomalies detected: {anomaly_results['n_anomalies']}")
from VegZ.data_quality import SpatialValidator
# Initialize spatial validator
spatial_val = SpatialValidator()
# Create coordinate data
coords_df = pd.DataFrame({
'latitude': np.random.uniform(30, 45, 100),
'longitude': np.random.uniform(-120, -100, 100)
})
# Validate coordinates
coord_validation = spatial_val.validate_coordinates(
df=coords_df,
lat_col='latitude',
lon_col='longitude'
)
print("Coordinate validation results:")
print(f"Total records: {coord_validation['total_records']}")
print(f"Valid coordinates: {coord_validation['valid_coordinates']}")
print(f"Issues found: {len(coord_validation['issues_found'])}")
# Detect geographic outliers
outliers = spatial_val.detect_geographic_outliers(
df=coords_df,
lat_col='latitude',
lon_col='longitude',
method='iqr',
threshold=1.5
)
print(f"Geographic outliers detected: {outliers.sum()}")
# Generate spatial quality report
spatial_report = spatial_val.generate_spatial_quality_report(
df=coords_df,
lat_col='latitude',
lon_col='longitude'
)
print("Spatial quality report generated")
print(f"Report keys: {list(spatial_report.keys())}")
from VegZ.data_quality import TemporalValidator
# Initialize temporal validator
temp_val = TemporalValidator()
# Create temporal data
temp_df = pd.DataFrame({
'date': ['2020-01-15', '2020-02-20', '2020-13-05', '2020-05-30', 'invalid-date'],
'collection_date': ['2020-01-15', '2020-02-20', '2020-03-05', '2020-05-30', '2020-06-15']
})
# Validate dates
date_validation = temp_val.validate_dates(
df=temp_df,
date_cols='date',
event_date_col='collection_date'
)
print("Temporal validation results:")
print(f"Total records: {date_validation['total_records']}")
print(f"Valid dates: {date_validation['valid_dates']}")
print(f"Issues found: {len(date_validation['issues_found'])}")
print(f"Date columns analyzed: {date_validation['date_columns_analyzed']}")
# Generate temporal quality report
temp_report = temp_val.generate_temporal_quality_report(
df=temp_df,
date_cols='date',
event_date_col='collection_date'
)
print("Temporal quality report generated")
print(f"Report keys: {list(temp_report.keys())}")
from VegZ import InteractiveVisualizer
interactive = InteractiveVisualizer()
# Create diversity dashboard
diversity_dashboard = interactive.create_diversity_dashboard(
diversity_results={'results': diversity, 'indices': ['shannon', 'simpson', 'richness']},
data=data
)
print("Diversity dashboard created")
print(f"Dashboard components: {list(diversity_dashboard.keys())}")
# Create ordination dashboard
ordination_dashboard = interactive.create_ordination_dashboard(
ordination_results=pca_results,
environmental_data=env_data
)
print("Ordination dashboard created")
print(f"Dashboard components: {list(ordination_dashboard.keys())}")
# Create clustering dashboard
clustering_dashboard = interactive.create_clustering_dashboard(
clustering_results=kmeans_results,
ordination_results=pca_results
)
print("Clustering dashboard created")
print(f"Dashboard components: {list(clustering_dashboard.keys())}")
# Create trait dashboard (if trait data loaded)
if 'trait_data' in locals():
trait_dashboard = interactive.create_trait_dashboard(
trait_results=func_diversity,
trait_data=trait_data
)
print("Trait dashboard created")
# Save dashboard
dashboard_file = interactive.save_dashboard(
dashboard=diversity_dashboard,
filename='diversity_dashboard.html',
format='html'
)
print(f"Dashboard saved to: {dashboard_file}")
from VegZ import ReportGenerator
report_gen = ReportGenerator()
# Prepare analysis results (use summary data)
analysis_results = {
'diversity_summary': {
'mean_shannon': diversity['shannon'].mean(),
'mean_simpson': diversity['simpson'].mean(),
'total_species': data.shape[1],
'total_sites': data.shape[0]
},
'ordination_summary': {
'method': 'PCA',
'variance_explained': pca_results['explained_variance_ratio'][:2].tolist()
},
'clustering_summary': {
'method': 'K-means',
'n_clusters': kmeans_results['n_clusters'],
'silhouette_score': kmeans_results['silhouette_score']
}
}
# Generate HTML report
report_content = report_gen.generate_analysis_report(
results=analysis_results,
output_format='html'
)
# Save report to file
output_file = report_gen.save_report(
report_content=report_content,
filename='vegetation_analysis_report.html',
format='html'
)
print("Report generated successfully!")
print(f"Report saved to: {output_file}")
print(f"Report length: {len(report_content)} characters")
from VegZ.data_management.standardization import SpeciesNameStandardizer
# Initialize standardizer
standardizer = SpeciesNameStandardizer()
# Validate individual species names
result = standardizer.validate_species_name("Quercus alba L.")
print(f"Valid: {result['is_valid']}")
print(f"Errors: {result['errors']}")
print(f"Cleaned name: '{result['cleaned_name']}'")
# Test various name formats
test_names = [
"Quercus alba", # Valid binomial
"Quercus", # Genus only
"quercus alba", # Capitalization error
"Quercus alba L.", # Author citation
"Quercus × alba", # Hybrid marker
"Quercus sp.", # Placeholder
"Quercus alba!", # Invalid character
]
print("\nIndividual name validation:")
for name in test_names:
result = standardizer.validate_species_name(name)
print(f"'{name}': Valid={result['is_valid']}, Errors={result['error_count']}")
# Batch validation
batch_results = standardizer.batch_validate_names(test_names)
print(f"\nBatch validation results:")
print(f"Shape: {batch_results.shape}")
print(f"Valid names: {batch_results['is_valid'].sum()}/{len(batch_results)}")
# Error distribution
error_columns = [col for col in batch_results.columns if col.startswith('has_')]
print("\nError distribution:")
for col in error_columns:
error_count = batch_results[col].sum()
error_type = col.replace('has_', '')
print(f" {error_type}: {error_count} names")
# DataFrame standardization
vegetation_df = pd.DataFrame({
'site_id': ['site_001', 'site_002', 'site_003'],
'species': ['Quercus alba', 'quercus sp.', 'Pinus strobus L.'],
'abundance': [25, 12, 18]
})
# Standardize with full error detection
enhanced_df = standardizer.standardize_dataframe(
df=vegetation_df,
species_column='species'
)
print(f"\nDataFrame standardization:")
print(f"Original columns: {list(vegetation_df.columns)}")
print(f"Enhanced columns: {len(enhanced_df.columns)} total")
print("New validation columns added for quality assessment")
# Fuzzy matching
reference_species = ["Quercus alba", "Pinus strobus", "Acer saccharum"]
query_species = ["Quercus albus", "Pinus strobus", "Acer sacchrum"]
matches = standardizer.fuzzy_match_species(
query_species=query_species,
reference_species=reference_species,
threshold=80
)
print(f"\nFuzzy matching results:")
for query, match in matches.items():
print(f" '{query}' -> '{match}'")
The TaxonomicResolver enables validation and standardization of plant species names against authoritative online taxonomic databases.
from VegZ import TaxonomicResolver, resolve_species_names
# Initialize with default source (World Flora Online)
resolver = TaxonomicResolver()
# Or specify sources
resolver = TaxonomicResolver(sources='gbif') # Single source
resolver = TaxonomicResolver(sources=['wfo', 'powo', 'gbif'], use_fallback=True) # Multiple with fallback
# Resolve a list of species names
species_list = ['Quercus robur', 'Pinus sylvestris', 'Betula pendula']
results = resolver.resolve_names(species_list)
print("Resolution results:")
print(results[['original_name', 'accepted_name', 'match_score', 'family', 'source']])
# Resolve species names directly from a file
results = resolver.resolve_from_file('species_list.csv')
results = resolver.resolve_from_file('data.xlsx', species_column='ScientificName')
# Export results to various formats
resolver.export_results(results, 'resolved_names.csv')
resolver.export_results(results, 'resolved_names.xlsx')
resolver.export_results(results, 'resolved_names.json')
import pandas as pd
# Load your vegetation data
df = pd.read_csv('vegetation_survey.csv')
# Resolve and update species names in the DataFrame
updated_df = resolver.resolve_dataframe(
df,
species_column='species',
update_names=True, # Replace with accepted names
add_taxonomy_columns=True, # Add family, genus, match_score columns
min_score_threshold=70 # Only update if score >= 70
)
# Original names are preserved in 'species_original' column
print(updated_df[['species_original', 'species', 'taxon_family', 'taxon_match_score']])
# Get resolution summary
resolver.print_summary(results)
# Output:
# ============================================================
# TAXONOMIC RESOLUTION SUMMARY
# ============================================================
# Total names processed: 10
# Successfully resolved: 9 (90.0%)
# Unresolved: 1
# ------------------------------------------------------------
# High confidence (>=90): 7
# Medium confidence (70-89): 2
# Low confidence (<70): 0
# Average match score: 92.5
# ------------------------------------------------------------
# Sources used: GBIF
# Unique families found: 5
# ============================================================
from VegZ.data_management import resolve_species_names, resolve_species_from_file, update_species_in_dataframe
# Quick resolution
results = resolve_species_names(['Quercus robur', 'Pinus sylvestris'], sources='gbif')
# Resolve from file with automatic export
results = resolve_species_from_file('species.csv', output_file='resolved.xlsx')
# Update DataFrame in one line
df_updated = update_species_in_dataframe(df, sources='gbif', min_score=70)
The resolution results include:
| Column | Description |
|---|---|
| original_name | Input species name |
| accepted_name | Resolved accepted name |
| accepted_author | Author citation |
| match_score | Confidence (0-100) |
| match_type | exact, fuzzy, candidate, synonym |
| taxonomic_status | accepted, synonym, unresolved |
| synonyms | Known synonyms from database |
| family | Taxonomic family |
| genus | Genus name |
| source | Database used (WFO, POWO, etc.) |
| source_id | Record ID in source database |
| source_url | Direct link to record |
def complete_vegetation_analysis(data, env_data=None):
"""
Complete vegetation analysis workflow with all major components.
"""
results = {}
# Step 1: Data Quality Check
print("Step 1: Data quality assessment...")
quality_stats = {
'n_sites': data.shape[0],
'n_species': data.shape[1],
'completeness': (data > 0).sum().sum() / (data.shape[0] * data.shape[1]),
'zero_sites': (data.sum(axis=1) == 0).sum(),
'zero_species': (data.sum(axis=0) == 0).sum()
}
results['data_quality'] = quality_stats
print(f" Data quality: {quality_stats['completeness']:.2%} completeness")
# Step 2: Diversity Analysis
print("Step 2: Diversity analysis...")
veg = VegZ()
veg.data = data
veg.species_matrix = data
diversity = veg.calculate_diversity(['shannon', 'simpson', 'richness'])
results['diversity'] = diversity
print(f" Diversity calculated for {diversity.shape[0]} sites")
# Step 3: Multivariate Analysis
print("Step 3: Multivariate analysis...")
pca_results = veg.pca_analysis(transform='hellinger')
nmds_results = veg.nmds_analysis(distance_metric='bray_curtis')
results['ordination'] = {
'pca': pca_results,
'nmds': nmds_results
}
print(f" PCA explained variance: {pca_results['explained_variance_ratio'][:2]}")
print(f" NMDS stress: {nmds_results['stress']:.3f}")
# Step 4: Clustering Analysis
print("Step 4: Clustering analysis...")
clustering = VegetationClustering()
# Find optimal number of clusters
elbow_results = clustering.comprehensive_elbow_analysis(data, plot_results=False)
optimal_k = elbow_results.get('recommendations', {}).get('consensus', 3)
kmeans_results = clustering.kmeans_clustering(data, n_clusters=optimal_k)
results['clustering'] = kmeans_results
print(f" Clustering completed with k={optimal_k}")
print(f" Silhouette score: {kmeans_results['silhouette_score']:.3f}")
# Step 5: Statistical Tests
if env_data is not None:
print("Step 5: Statistical analysis...")
stats = EcologicalStatistics()
groups = kmeans_results['cluster_labels']
# Calculate distance matrix for PERMANOVA
distances = pdist(data, metric='braycurtis')
distance_matrix = squareform(distances)
permanova = stats.permanova(
distance_matrix=distance_matrix,
groups=groups,
permutations=199 # Reduced for speed
)
results['statistics'] = permanova
print(f" PERMANOVA: F={permanova['f_statistic']:.3f}, p={permanova['p_value']:.3f}")
# Step 6: Generate Report
print("Step 6: Generating report...")
report_gen = ReportGenerator()
summary_results = {
'data_summary': quality_stats,
'diversity_summary': {
'mean_shannon': diversity['shannon'].mean(),
'mean_simpson': diversity['simpson'].mean(),
'mean_richness': diversity['richness'].mean()
},
'clustering_summary': {
'method': 'K-means',
'n_clusters': optimal_k,
'silhouette_score': kmeans_results['silhouette_score']
}
}
if env_data is not None:
summary_results['statistics_summary'] = {
'permanova_f': permanova['f_statistic'],
'permanova_p': permanova['p_value']
}
report_content = report_gen.generate_analysis_report(
results=summary_results,
output_format='html'
)
output_file = report_gen.save_report(
report_content=report_content,
filename='complete_analysis_report.html',
format='html'
)
results['report_file'] = output_file
print(f" Report saved to: {output_file}")
return results
# Run complete analysis
print("=== COMPLETE VEGETATION ANALYSIS WORKFLOW ===")
complete_results = complete_vegetation_analysis(data, env_data)
print(f"\n=== ANALYSIS COMPLETE ===")
print(f"Results include: {list(complete_results.keys())}")
print("All analyses completed successfully with verified syntax!")
# For large datasets
def handle_large_dataset(data, chunk_size=1000):
"""Handle large datasets efficiently."""
if len(data) > chunk_size:
print(f"Processing large dataset ({len(data)} sites) in chunks...")
# Process diversity in chunks for memory efficiency
diversity_chunks = []
for i in range(0, len(data), chunk_size):
chunk = data.iloc[i:i+chunk_size]
chunk_diversity = quick_diversity_analysis(chunk)
diversity_chunks.append(chunk_diversity)
# Combine results
combined_diversity = pd.concat(diversity_chunks, axis=0)
return combined_diversity
else:
return quick_diversity_analysis(data)
# Memory-efficient analysis
large_diversity = handle_large_dataset(data, chunk_size=25)
print(f"Large dataset analysis completed: {large_diversity.shape}")
This manual provides complete, verified examples for all major VegZ functionality:
calculate_diversity(), pca_analysis(), nmds_analysis(), kmeans_clustering()calculate_all_indices(), calculate_index(), hill_numbers()pca_analysis(), correspondence_analysis(), detrended_correspondence_analysis(), cca_analysis(), redundancy_analysis(), principal_coordinates_analysis()kmeans_clustering(), fuzzy_cmeans_clustering(), comprehensive_elbow_analysis()permanova(), anosim(), mantel_test(), indicator_species_analysis()spatial_interpolation(), spatial_autocorrelation()trend_detection(), phenology_modeling(), seasonal_decomposition()biomass_prediction(), community_classification(), species_identification()create_diversity_dashboard(), create_ordination_dashboard()validate_coordinates(), detect_geographic_outliers()validate_species_name(), batch_validate_names()resolve_names(), resolve_from_file(), resolve_dataframe(), export_results()All examples in this manual use the correct syntax and will run successfully with the VegZ package.