VegZ

VegZ: Complete User Manual

Table of Contents

  1. Introduction
  2. Installation
  3. Quick Start
  4. Core VegZ Class
  5. Diversity Analysis
  6. Multivariate Analysis
  7. Clustering Methods
  8. Statistical Analysis
  9. Temporal Analysis
  10. Spatial Analysis
  11. Environmental Modeling
  12. Functional Trait Analysis
  13. Machine Learning
  14. Data Quality and Validation
  15. Interactive Visualization
  16. Species Name Standardization
  17. Taxonomic Name Resolution
  18. Best Practices

Introduction

VegZ is a comprehensive Python package for vegetation data analysis and environmental modeling. This manual provides complete, working examples using the correct API syntax.

Installation

pip install VegZ

For the development version:

pip install git+https://github.com/mhatim99/VegZ.git

Quick Start

Quick Analysis Functions

import pandas as pd
import numpy as np
from VegZ import quick_diversity_analysis, quick_ordination, quick_clustering, quick_elbow_analysis

# Create sample data
n_sites, n_species = 50, 20
data = pd.DataFrame(
    np.random.exponential(2, (n_sites, n_species)),
    columns=[f'Species_{i+1}' for i in range(n_species)]
)

# Quick diversity analysis
diversity_results = quick_diversity_analysis(data)
print("Quick diversity analysis completed")
print(f"Shape: {diversity_results.shape}")

# Quick ordination
ordination_results = quick_ordination(data, method='pca')
print("Quick PCA completed")
print(f"Available keys: {list(ordination_results.keys())}")

# Quick clustering
clustering_results = quick_clustering(data, n_clusters=3, method='kmeans')
print("Quick clustering completed")

# Quick elbow analysis
elbow_results = quick_elbow_analysis(data, max_k=10, plot_results=False)
print("Quick elbow analysis completed")

Core VegZ Class

Basic Usage

from VegZ import VegZ
import pandas as pd
import numpy as np

# Initialize VegZ
veg = VegZ()

# Create sample data
n_sites, n_species = 50, 20
data = pd.DataFrame(
    np.random.exponential(2, (n_sites, n_species)),
    columns=[f'Species_{i+1}' for i in range(n_species)]
)

# Assign data to VegZ instance
veg.data = data
veg.species_matrix = data

# Basic diversity calculation
diversity = veg.calculate_diversity(['shannon', 'simpson', 'richness'])
print(f"Diversity calculated for {diversity.shape[0]} sites")

# PCA analysis
pca_results = veg.pca_analysis(transform='hellinger')
print(f"PCA explained variance: {pca_results['explained_variance_ratio'][:2]}")

# NMDS analysis
nmds_results = veg.nmds_analysis(distance_metric='bray_curtis', n_dimensions=2)
print(f"NMDS stress: {nmds_results['stress']:.3f}")

# K-means clustering
kmeans_results = veg.kmeans_clustering(n_clusters=3, transform='hellinger')
print(f"K-means inertia: {kmeans_results['inertia']:.3f}")

# Hierarchical clustering
hier_results = veg.hierarchical_clustering(distance_metric='bray_curtis', linkage_method='average')
print("Hierarchical clustering completed")

# Rarefaction analysis
rarefaction = veg.rarefaction_curve()
print(f"Rarefaction curve shape: {rarefaction.shape}")

# Summary statistics
summary = veg.summary_statistics()
print(f"Data summary: {summary['n_sites']} sites, {summary['n_species']} species")

Plotting Functions

import matplotlib.pyplot as plt

# Plot diversity
fig = veg.plot_diversity(diversity, index_name='shannon')
plt.title('Shannon Diversity')
plt.show()

# Plot ordination
fig = veg.plot_ordination(pca_results)
plt.title('PCA Biplot')
plt.show()

# Plot species accumulation curve
fig = veg.plot_species_accumulation(rarefaction)
plt.title('Species Accumulation Curve')
plt.show()

# Plot cluster dendrogram
fig = veg.plot_cluster_dendrogram(hier_results)
plt.title('Cluster Dendrogram')
plt.show()

Diversity Analysis

DiversityAnalyzer Class

from VegZ import DiversityAnalyzer

diversity = DiversityAnalyzer()

# Calculate all diversity indices at once
all_indices = diversity.calculate_all_indices(data)
print("All diversity indices:")
print(f"Available indices: {list(all_indices.columns)}")
print(all_indices.head())

# Calculate individual indices
shannon = diversity.calculate_index(data, 'shannon')
simpson = diversity.calculate_index(data, 'simpson')
richness = diversity.calculate_index(data, 'richness')
evenness = diversity.calculate_index(data, 'evenness')

print(f"Shannon diversity range: {shannon.min():.3f} - {shannon.max():.3f}")
print(f"Simpson diversity range: {simpson.min():.3f} - {simpson.max():.3f}")

# Advanced diversity indices
fisher_alpha = diversity.calculate_index(data, 'fisher_alpha')
berger_parker = diversity.calculate_index(data, 'berger_parker')

print(f"Fisher's alpha range: {fisher_alpha.min():.3f} - {fisher_alpha.max():.3f}")
print(f"Berger-Parker range: {berger_parker.min():.3f} - {berger_parker.max():.3f}")

Hill Numbers

# Calculate Hill numbers (correct parameter name: q_values)
hill_numbers = diversity.hill_numbers(data, q_values=[0, 0.5, 1, 1.5, 2])
print("Hill numbers:")
print(f"Shape: {hill_numbers.shape}")
print(hill_numbers.head())

# Interpretation (Hill numbers are "effective numbers of species"):
# q=0: Species richness (Hill 0)
# q=1: Exponential of Shannon entropy (Hill 1)
# q=2: Inverse Simpson index (Hill 2)

Beta Diversity

# Whittaker's beta diversity (returns single value)
beta_whittaker = diversity.beta_diversity(data, method='whittaker')
print(f"Whittaker's beta diversity: {beta_whittaker:.3f}")

# Sørensen dissimilarity (returns distance matrix)
beta_sorensen = diversity.beta_diversity(data, method='sorensen')
print(f"Sørensen dissimilarity matrix shape: {beta_sorensen.shape}")

# Jaccard dissimilarity (returns distance matrix)
beta_jaccard = diversity.beta_diversity(data, method='jaccard')
print(f"Jaccard dissimilarity matrix shape: {beta_jaccard.shape}")

Richness Estimators

# Individual richness estimators
chao1 = diversity.chao1_estimator(data)
ace = diversity.ace_estimator(data, rare_threshold=10)
jack1 = diversity.jackknife1_estimator(data)
jack2 = diversity.jackknife2_estimator(data)

print("Richness estimators:")
print(f"Chao1 range: {chao1.min():.1f} - {chao1.max():.1f}")
print(f"ACE range: {ace.min():.1f} - {ace.max():.1f}")
print(f"Jackknife1 range: {jack1.min():.1f} - {jack1.max():.1f}")
print(f"Jackknife2 range: {jack2.min():.1f} - {jack2.max():.1f}")

Multivariate Analysis

MultivariateAnalyzer Class

from VegZ import MultivariateAnalyzer
import numpy as np

multivar = MultivariateAnalyzer()

# PCA Analysis
pca_results = multivar.pca_analysis(data, transform='hellinger')
print("PCA Analysis:")
print(f"Explained variance ratio: {pca_results['explained_variance_ratio'][:3]}")
cumulative_variance = np.cumsum(pca_results['explained_variance_ratio'])
print(f"Cumulative variance: {cumulative_variance[:3]}")

# NMDS Analysis
nmds_results = multivar.nmds_analysis(
    data,
    distance_metric='bray_curtis',
    n_dimensions=2,
    max_iterations=300
)
print(f"NMDS stress: {nmds_results['stress']:.4f}")
print(f"Converged: {nmds_results['converged']}")

# Correspondence Analysis (CA)
ca_results = multivar.correspondence_analysis(data, scaling=1)
print("CA Analysis completed")
print(f"Available keys: {list(ca_results.keys())}")

# Detrended Correspondence Analysis (DCA)
dca_results = multivar.detrended_correspondence_analysis(data, segments=26)
print("DCA Analysis completed")
print(f"Gradient lengths: {dca_results['gradient_lengths']}")

Constrained Ordination

# Create environmental data
env_data = pd.DataFrame({
    'Temperature': np.random.normal(15, 5, n_sites),
    'Precipitation': np.random.exponential(100, n_sites),
    'pH': np.random.uniform(4.5, 8.0, n_sites),
    'Elevation': np.random.uniform(100, 2000, n_sites)
})

# Canonical Correspondence Analysis (CCA)
cca_results = multivar.canonical_correspondence_analysis(
    species_data=data,
    env_data=env_data,
    scaling=1
)
print("CCA Analysis completed")
print(f"Eigenvalues: {cca_results['eigenvalues'][:3]}")
print(f"Species-environment correlations: {cca_results['species_env_correlation'][:3]}")

# Alternative abbreviated method name (same function)
cca_results2 = multivar.cca_analysis(species_data=data, env_data=env_data)
print("CCA using abbreviated method completed")

Environmental Vector Fitting

# Environmental vector fitting
env_fit = multivar.environmental_fitting(
    ordination_scores=pca_results['site_scores'].iloc[:, :2],  # First 2 PC axes
    env_data=env_data,
    method='vector'
)

print("Environmental vector fitting:")
print("Significant vectors (p < 0.05):")
for env_var, p_value in env_fit['p_values'].items():
    if p_value < 0.05:
        r2_value = env_fit['r_squared'][env_var]
        print(f"  {env_var}: R² = {r2_value:.3f}, p = {p_value:.3f}")

Goodness of Fit

# Test ordination quality
gof_results = multivar.goodness_of_fit_test(
    ordination_results=pca_results,
    original_data=data,
    distance_metric='bray_curtis'
)

print("Goodness of fit:")
print(f"Correlation: {gof_results['correlation']:.3f}")
print(f"Stress: {gof_results['stress']:.3f}")

Clustering Methods

VegetationClustering Class

from VegZ import VegetationClustering

clustering = VegetationClustering()

# K-means clustering
kmeans_results = clustering.kmeans_clustering(data, n_clusters=4, n_init=10)
print("K-means clustering:")
print(f"Silhouette score: {kmeans_results['silhouette_score']:.3f}")
print(f"Calinski-Harabasz score: {kmeans_results['calinski_harabasz_score']:.1f}")

# Hierarchical clustering
hierarchical_results = clustering.hierarchical_clustering(
    data,
    method='ward',
    metric='euclidean',
    n_clusters=4
)
print(f"Hierarchical clustering completed with {hierarchical_results['n_clusters']} clusters")

# Fuzzy C-means clustering (correct parameter name: fuzziness)
fuzzy_results = clustering.fuzzy_cmeans_clustering(
    data,
    n_clusters=4,
    fuzziness=2.0,  # Note: correct parameter name
    max_iter=100
)
print("Fuzzy C-means clustering completed")
print(f"Fuzzy partition coefficient: {fuzzy_results['partition_coefficient']:.3f}")

# DBSCAN clustering
dbscan_results = clustering.dbscan_clustering(
    data,
    eps=0.5,
    min_samples=5,
    distance_metric='euclidean'
)
print(f"DBSCAN found {dbscan_results['n_clusters']} clusters")
print(f"Number of noise points: {dbscan_results['n_noise']}")

# Gaussian Mixture clustering
gmm_results = clustering.gaussian_mixture_clustering(
    data,
    n_components=4,
    covariance_type='full'
)
print(f"GMM AIC: {gmm_results['aic']:.1f}")
print(f"GMM BIC: {gmm_results['bic']:.1f}")

Optimal Number of Clusters

# Comprehensive elbow analysis
elbow_results = clustering.comprehensive_elbow_analysis(
    data,
    k_range=range(2, 11),
    methods=['knee_locator', 'derivative', 'variance_explained'],
    transform='hellinger',
    plot_results=False
)

print("Elbow analysis results:")
print(f"Recommended k values: {elbow_results['recommendations']}")

# Optimal clusters using multiple criteria
optimal_results = clustering.optimal_clusters_analysis(
    data,
    k_range=range(2, 11),
    methods=['silhouette', 'gap_statistic']
)

print("Optimal cluster analysis:")
print(f"Best k by silhouette: {optimal_results['silhouette']['best_k']}")
print(f"Best k by gap statistic: {optimal_results['gap_statistic']['best_k']}")

Statistical Analysis

EcologicalStatistics Class

from VegZ import EcologicalStatistics
from scipy.spatial.distance import pdist, squareform

stats = EcologicalStatistics()

# Create distance matrix for tests that require it
distances = pdist(data, metric='braycurtis')
distance_matrix = squareform(distances)

# Create grouping variable
groups = kmeans_results['cluster_labels']

# PERMANOVA (requires distance matrix)
permanova = stats.permanova(
    distance_matrix=distance_matrix,
    groups=groups,
    permutations=999
)

print("PERMANOVA results:")
print(f"F-statistic: {permanova['f_statistic']:.3f}")
print(f"p-value: {permanova['p_value']:.3f}")
print(f"R-squared: {permanova['r_squared']:.3f}")

# ANOSIM
anosim = stats.anosim(
    distance_matrix=distance_matrix,
    groups=groups,
    permutations=999
)

print("ANOSIM results:")
print(f"R statistic: {anosim['r_statistic']:.3f}")
print(f"p-value: {anosim['p_value']:.3f}")

# MRPP (requires distance matrix)
mrpp = stats.mrpp(
    distance_matrix=distance_matrix,
    groups=groups,
    permutations=999
)

print("MRPP results:")
print(f"A statistic: {mrpp['a_statistic']:.3f}")
print(f"p-value: {mrpp['p_value']:.3f}")

Mantel Tests

# Create second distance matrix for Mantel test
env_distances = pdist(env_data, metric='euclidean')
env_distance_matrix = squareform(env_distances)

# Mantel test
mantel = stats.mantel_test(
    matrix1=distance_matrix,
    matrix2=env_distance_matrix,
    permutations=999,
    method='pearson'
)

print("Mantel test results:")
print(f"Correlation: {mantel['correlation']:.3f}")
print(f"p-value: {mantel['p_value']:.3f}")

# Partial Mantel test (requires 3 matrices)
spatial_coords = env_data[['Temperature', 'pH']].values  # Use as spatial proxy
spatial_distances = pdist(spatial_coords, metric='euclidean')
spatial_distance_matrix = squareform(spatial_distances)

partial_mantel = stats.partial_mantel_test(
    matrix1=distance_matrix,
    matrix2=env_distance_matrix,
    matrix3=spatial_distance_matrix,
    permutations=999
)

print("Partial Mantel test results:")
print(f"Partial correlation: {partial_mantel['partial_correlation']:.3f}")
print(f"p-value: {partial_mantel['p_value']:.3f}")

Indicator Species Analysis

# Indicator species analysis
indicator = stats.indicator_species_analysis(
    species_data=data,
    groups=groups,
    permutations=999
)

print("Indicator Species Analysis:")
print("Top indicator species:")
top_indicators = indicator['species_stats'].nlargest(5, 'indicator_value')
print(top_indicators[['indicator_value', 'p_value']])

# SIMPER analysis
simper = stats.simper_analysis(
    species_data=data,
    groups=groups,
    distance_metric='bray_curtis'
)

print("SIMPER analysis completed")
print(f"Average dissimilarity: {simper['average_dissimilarity']:.3f}")

Temporal Analysis

TemporalAnalyzer Class

from VegZ import TemporalAnalyzer

temporal = TemporalAnalyzer()

# Create temporal data
temporal_data = pd.DataFrame({
    'date': pd.date_range('2015-01-01', periods=100, freq='W'),
    'abundance': np.random.exponential(2, 100) + np.sin(np.arange(100) * 0.1) * 5,
    'species': 'Species_1'
})

# Trend detection (correct parameter names)
trend_results = temporal.trend_detection(
    data=temporal_data,
    time_col='date',
    response_col='abundance',
    method='mann_kendall'
)

print("Trend Detection:")
print(f"Trend: {trend_results['trend']}")
print(f"p-value: {trend_results['p_value']:.4f}")
print(f"Sen's slope: {trend_results['sens_slope']:.3f}")

# Phenology modeling (correct parameter names)
phenology_results = temporal.phenology_modeling(
    data=temporal_data,
    time_col='date',
    response_col='abundance',
    model_type='sigmoid'
)

print("Phenology modeling:")
print(f"Model type: {phenology_results['model_type']}")
model_fit = phenology_results['results']['combined']
print(f"Model success: {model_fit['success']}")
print(f"R-squared: {model_fit['r_squared']:.3f}")

# Seasonal decomposition
seasonal_results = temporal.seasonal_decomposition(
    data=temporal_data,
    time_col='date',
    response_col='abundance',
    method='classical',
    period=52  # Weekly data, annual period
)

print("Seasonal decomposition:")
print(f"Method: {seasonal_results['method']}")
print(f"Period: {seasonal_results['period']}")
print(f"Trend component shape: {seasonal_results['trend'].shape}")

# Growth curve fitting
growth_data = pd.DataFrame({
    'time': np.arange(50),
    'size': np.random.exponential(1, 50) * np.arange(1, 51) * 0.5,
    'species': 'Species_A'
})

growth_results = temporal.growth_curve_fitting(
    data=growth_data,
    time_col='time',
    size_col='size',
    curve_type='logistic'
)

print("Growth curve fitting:")
growth_fit = growth_results['results']['combined']
print(f"R-squared: {growth_fit['r_squared']:.3f}")
print(f"Growth parameters: {growth_fit['growth_parameters']}")

Spatial Analysis

SpatialAnalyzer Class

from VegZ import SpatialAnalyzer

spatial = SpatialAnalyzer()

# Create spatial data (correct format)
spatial_data = pd.DataFrame({
    'longitude': np.random.uniform(-120, -100, 50),
    'latitude': np.random.uniform(30, 45, 50),
    'response': np.random.exponential(2, 50)
})

# Spatial interpolation (correct parameter names)
idw_results = spatial.spatial_interpolation(
    data=spatial_data,
    x_col='longitude',
    y_col='latitude',
    z_col='response',
    method='idw',
    grid_resolution=0.1
)

print("IDW interpolation completed")
print(f"Grid shape: {idw_results['Z_grid'].shape}")

# Kriging interpolation
kriging_results = spatial.spatial_interpolation(
    data=spatial_data,
    x_col='longitude',
    y_col='latitude',
    z_col='response',
    method='kriging',
    grid_resolution=0.1
)

print("Kriging interpolation completed")
print(f"Available results: {list(kriging_results.keys())}")

# Spatial autocorrelation (correct parameter names)
morans_i = spatial.spatial_autocorrelation(
    data=spatial_data,
    x_col='longitude',
    y_col='latitude',
    response_col='response',
    method='morans_i'
)

print("Spatial autocorrelation:")
print(f"Moran's I: {morans_i['morans_i']:.4f}")
print(f"p-value: {morans_i['p_value']:.4f}")
print(f"Expected I: {morans_i['expected_i']:.4f}")

# Habitat suitability modeling
presence_data = pd.DataFrame({
    'longitude': np.random.uniform(-120, -100, 100),
    'latitude': np.random.uniform(30, 45, 100),
    'presence': np.random.choice([0, 1], 100, p=[0.7, 0.3]),
    'temperature': np.random.normal(15, 5, 100),
    'precipitation': np.random.exponential(100, 100)
})

habitat_results = spatial.habitat_suitability_modeling(
    presence_data=presence_data,
    environmental_data=presence_data,  # Same dataframe with env variables
    x_col='longitude',
    y_col='latitude',
    response_col='presence',
    method='random_forest'
)

print("Habitat suitability modeling completed")
print(f"Model performance: {habitat_results['model_performance']}")

Environmental Modeling

EnvironmentalModeler Class

from VegZ import EnvironmentalModeler

env_model = EnvironmentalModeler()

# Species response curves
species_response = env_model.species_response_curves(
    species_data=data.iloc[:, 0],  # First species
    environmental_var=env_data['Temperature'],
    curve_type='gaussian'
)

print("Species response curves:")
print(f"Optimum temperature: {species_response['optimum']:.2f}")
print(f"Tolerance: {species_response['tolerance']:.2f}")

# GAM fitting
gam_results = env_model.fit_gam(
    data=pd.concat([data.iloc[:, :5], env_data], axis=1),
    response_col='Species_1',
    predictor_cols=['Temperature', 'pH', 'Precipitation'],
    family='gaussian'
)

print("GAM results:")
print(f"R-squared: {gam_results['r_squared']:.3f}")
print(f"AIC: {gam_results['aic']:.1f}")

# Environmental gradient analysis
gradient_results = env_model.environmental_gradient_analysis(
    species_data=data,
    env_data=env_data,
    method='cca'
)

print("Environmental gradient analysis completed")
print(f"Constrained variance: {gradient_results['constrained_variance']:.3f}")

Functional Trait Analysis

FunctionalTraitAnalyzer Class

from VegZ import FunctionalTraitAnalyzer

traits_analyzer = FunctionalTraitAnalyzer()

# Create trait data
trait_data = pd.DataFrame({
    'SLA': np.random.normal(20, 5, n_species),
    'Height': np.random.lognormal(1, 0.5, n_species),
    'SeedMass': np.random.lognormal(0, 1, n_species)
}, index=[f'Species_{i+1}' for i in range(n_species)])

# Load trait data into analyzer (required step)
traits_analyzer.load_trait_data(trait_data, abundance_data=data)

# Calculate functional diversity
func_diversity = traits_analyzer.calculate_functional_diversity(
    traits=['SLA', 'Height', 'SeedMass'],
    standardize=True
)

print("Functional diversity:")
print(f"Available indices: {func_diversity['site_diversity'].columns.tolist()}")
print(f"Mean functional richness: {func_diversity['site_diversity']['FRic'].mean():.3f}")

# Calculate functional beta diversity
func_beta = traits_analyzer.calculate_functional_beta_diversity(
    traits=['SLA', 'Height', 'SeedMass']
)

print("Functional beta diversity:")
print(f"Gamma diversity: {func_beta['gamma_diversity']:.3f}")
print(f"Mean alpha diversity: {func_beta['mean_alpha_diversity']:.3f}")
print(f"Beta diversity: {func_beta['beta_diversity']:.3f}")

# Identify functional groups
func_groups = traits_analyzer.identify_functional_groups(
    n_groups=4,
    traits=['SLA', 'Height', 'SeedMass'],
    method='hierarchical'
)

print("Functional groups:")
print(f"Number of groups: {func_groups['n_groups']}")
print("Group characteristics available")

# Trait-environment relationships
trait_env = traits_analyzer.trait_environment_relationships(
    environmental_data=env_data
)

print("Trait-environment relationships:")
print(f"Correlations shape: {trait_env['correlations'].shape}")
print(f"Significant correlations: {len(trait_env['significant_correlations'])}")

Machine Learning

MachineLearningAnalyzer Class

from VegZ import MachineLearningAnalyzer

ml = MachineLearningAnalyzer()

# Prepare ML data
ml_data = pd.concat([data.iloc[:, :5], env_data], axis=1)
ml_data['biomass'] = np.random.exponential(50, n_sites)

# Biomass prediction
biomass_results = ml.biomass_prediction(
    data=ml_data,
    biomass_column='biomass',
    predictor_features=['Species_1', 'Species_2', 'Temperature', 'pH'],
    model_type='rf',
    optimize_hyperparameters=False
)

print("Biomass Prediction Results:")
print(f"Model performance: {biomass_results['performance']}")
print("Feature importance:")
for feature, importance in zip(biomass_results['feature_names'], biomass_results['feature_importance']):
    print(f"  {feature}: {importance:.3f}")

# Community classification
community_results = ml.community_classification(
    data=ml_data,
    species_columns=[f'Species_{i+1}' for i in range(5)],
    n_communities=3,
    method='kmeans'
)

print("Community Classification:")
print(f"Number of communities: {community_results['n_communities']}")
print(f"Cluster centers shape: {community_results['cluster_centers'].shape}")

# Species identification
ml_data['leaf_length'] = np.random.normal(5, 1, n_sites)
ml_data['leaf_width'] = np.random.normal(2, 0.5, n_sites)
species_labels = np.random.choice(['Species_A', 'Species_B', 'Species_C'], n_sites)
ml_data['species_label'] = species_labels

identification_results = ml.species_identification(
    data=ml_data,
    morphological_features=['leaf_length', 'leaf_width'],
    species_column='species_label',
    test_size=0.3
)

print("Species Identification:")
print(f"Best model: {identification_results['best_model']}")
print("Model performance:")
for model, performance in identification_results['performance'].items():
    print(f"  {model}: accuracy = {performance.get('accuracy', 'N/A')}")

# Habitat suitability modeling
ml_data['presence'] = np.random.choice([0, 1], n_sites, p=[0.6, 0.4])

habitat_results = ml.habitat_suitability_modeling(
    data=ml_data,
    species_column='presence',
    environmental_features=['Temperature', 'pH', 'Precipitation'],
    model_type='rf',
    cross_validation=True
)

print("Habitat Suitability Modeling:")
print(f"Cross-validation score: {habitat_results['cv_scores'].mean():.3f}")

# Dimensionality reduction
dim_reduction = ml.dimensionality_reduction(
    data=ml_data,
    feature_columns=[f'Species_{i+1}' for i in range(5)],
    method='pca',
    n_components=2
)

print("Dimensionality Reduction:")
print(f"Explained variance ratio: {dim_reduction['explained_variance_ratio']}")

# Anomaly detection
anomaly_results = ml.ecological_anomaly_detection(
    data=ml_data,
    feature_columns=[f'Species_{i+1}' for i in range(5)],
    contamination=0.1,
    method='isolation_forest'
)

print("Anomaly Detection:")
print(f"Number of anomalies detected: {anomaly_results['n_anomalies']}")

Data Quality and Validation

Spatial Validation

from VegZ.data_quality import SpatialValidator

# Initialize spatial validator
spatial_val = SpatialValidator()

# Create coordinate data
coords_df = pd.DataFrame({
    'latitude': np.random.uniform(30, 45, 100),
    'longitude': np.random.uniform(-120, -100, 100)
})

# Validate coordinates
coord_validation = spatial_val.validate_coordinates(
    df=coords_df,
    lat_col='latitude',
    lon_col='longitude'
)

print("Coordinate validation results:")
print(f"Total records: {coord_validation['total_records']}")
print(f"Valid coordinates: {coord_validation['valid_coordinates']}")
print(f"Issues found: {len(coord_validation['issues_found'])}")

# Detect geographic outliers
outliers = spatial_val.detect_geographic_outliers(
    df=coords_df,
    lat_col='latitude',
    lon_col='longitude',
    method='iqr',
    threshold=1.5
)

print(f"Geographic outliers detected: {outliers.sum()}")

# Generate spatial quality report
spatial_report = spatial_val.generate_spatial_quality_report(
    df=coords_df,
    lat_col='latitude',
    lon_col='longitude'
)

print("Spatial quality report generated")
print(f"Report keys: {list(spatial_report.keys())}")

Temporal Validation

from VegZ.data_quality import TemporalValidator

# Initialize temporal validator
temp_val = TemporalValidator()

# Create temporal data
temp_df = pd.DataFrame({
    'date': ['2020-01-15', '2020-02-20', '2020-13-05', '2020-05-30', 'invalid-date'],
    'collection_date': ['2020-01-15', '2020-02-20', '2020-03-05', '2020-05-30', '2020-06-15']
})

# Validate dates
date_validation = temp_val.validate_dates(
    df=temp_df,
    date_cols='date',
    event_date_col='collection_date'
)

print("Temporal validation results:")
print(f"Total records: {date_validation['total_records']}")
print(f"Valid dates: {date_validation['valid_dates']}")
print(f"Issues found: {len(date_validation['issues_found'])}")
print(f"Date columns analyzed: {date_validation['date_columns_analyzed']}")

# Generate temporal quality report
temp_report = temp_val.generate_temporal_quality_report(
    df=temp_df,
    date_cols='date',
    event_date_col='collection_date'
)

print("Temporal quality report generated")
print(f"Report keys: {list(temp_report.keys())}")

Interactive Visualization

InteractiveVisualizer Class

from VegZ import InteractiveVisualizer

interactive = InteractiveVisualizer()

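# Create diversity dashboard
# NOTE: `diversity` here must be the result of veg.calculate_diversity(...) from the
# "Core VegZ Class" section. The "Diversity Analysis" section rebinds the name
# `diversity` to a DiversityAnalyzer instance, so if you ran that section, recompute
# the diversity results (or use a different variable name) before this call.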
diversity_dashboard = interactive.create_diversity_dashboard(
    diversity_results={'results': diversity, 'indices': ['shannon', 'simpson', 'richness']},
    data=data
)

print("Diversity dashboard created")
print(f"Dashboard components: {list(diversity_dashboard.keys())}")

# Create ordination dashboard
ordination_dashboard = interactive.create_ordination_dashboard(
    ordination_results=pca_results,
    environmental_data=env_data
)

print("Ordination dashboard created")
print(f"Dashboard components: {list(ordination_dashboard.keys())}")

# Create clustering dashboard
clustering_dashboard = interactive.create_clustering_dashboard(
    clustering_results=kmeans_results,
    ordination_results=pca_results
)

print("Clustering dashboard created")
print(f"Dashboard components: {list(clustering_dashboard.keys())}")

# Create trait dashboard (if trait data loaded)
if 'trait_data' in locals():
    trait_dashboard = interactive.create_trait_dashboard(
        trait_results=func_diversity,
        trait_data=trait_data
    )
    print("Trait dashboard created")

# Save dashboard
dashboard_file = interactive.save_dashboard(
    dashboard=diversity_dashboard,
    filename='diversity_dashboard.html',
    format='html'
)

print(f"Dashboard saved to: {dashboard_file}")

Report Generation

from VegZ import ReportGenerator

report_gen = ReportGenerator()

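# Prepare analysis results (use summary data)
# NOTE: `diversity` must be the result of veg.calculate_diversity(...) so that
# diversity['shannon'] and diversity['simpson'] can be looked up; it must not be the
# DiversityAnalyzer instance that the "Diversity Analysis" section binds to the same name.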
analysis_results = {
    'diversity_summary': {
        'mean_shannon': diversity['shannon'].mean(),
        'mean_simpson': diversity['simpson'].mean(),
        'total_species': data.shape[1],
        'total_sites': data.shape[0]
    },
    'ordination_summary': {
        'method': 'PCA',
        'variance_explained': pca_results['explained_variance_ratio'][:2].tolist()
    },
    'clustering_summary': {
        'method': 'K-means',
        'n_clusters': kmeans_results['n_clusters'],
        'silhouette_score': kmeans_results['silhouette_score']
    }
}

# Generate HTML report
report_content = report_gen.generate_analysis_report(
    results=analysis_results,
    output_format='html'
)

# Save report to file
output_file = report_gen.save_report(
    report_content=report_content,
    filename='vegetation_analysis_report.html',
    format='html'
)

print("Report generated successfully!")
print(f"Report saved to: {output_file}")
print(f"Report length: {len(report_content)} characters")

Species Name Standardization

SpeciesNameStandardizer Class

from VegZ.data_management.standardization import SpeciesNameStandardizer

# Initialize standardizer
standardizer = SpeciesNameStandardizer()

# Validate individual species names
result = standardizer.validate_species_name("Quercus alba L.")
print(f"Valid: {result['is_valid']}")
print(f"Errors: {result['errors']}")
print(f"Cleaned name: '{result['cleaned_name']}'")

# Test various name formats
test_names = [
    "Quercus alba",           # Valid binomial
    "Quercus",               # Genus only
    "quercus alba",          # Capitalization error
    "Quercus alba L.",       # Author citation
    "Quercus × alba",        # Hybrid marker
    "Quercus sp.",           # Placeholder
    "Quercus alba!",         # Invalid character
]

print("\nIndividual name validation:")
for name in test_names:
    result = standardizer.validate_species_name(name)
    print(f"'{name}': Valid={result['is_valid']}, Errors={result['error_count']}")

# Batch validation
batch_results = standardizer.batch_validate_names(test_names)
print(f"\nBatch validation results:")
print(f"Shape: {batch_results.shape}")
print(f"Valid names: {batch_results['is_valid'].sum()}/{len(batch_results)}")

# Error distribution
error_columns = [col for col in batch_results.columns if col.startswith('has_')]
print("\nError distribution:")
for col in error_columns:
    error_count = batch_results[col].sum()
    error_type = col.replace('has_', '')
    print(f"  {error_type}: {error_count} names")

# DataFrame standardization
vegetation_df = pd.DataFrame({
    'site_id': ['site_001', 'site_002', 'site_003'],
    'species': ['Quercus alba', 'quercus sp.', 'Pinus strobus L.'],
    'abundance': [25, 12, 18]
})

# Standardize with full error detection
enhanced_df = standardizer.standardize_dataframe(
    df=vegetation_df,
    species_column='species'
)

print(f"\nDataFrame standardization:")
print(f"Original columns: {list(vegetation_df.columns)}")
print(f"Enhanced columns: {len(enhanced_df.columns)} total")
print("New validation columns added for quality assessment")

# Fuzzy matching
reference_species = ["Quercus alba", "Pinus strobus", "Acer saccharum"]
query_species = ["Quercus albus", "Pinus strobus", "Acer sacchrum"]

matches = standardizer.fuzzy_match_species(
    query_species=query_species,
    reference_species=reference_species,
    threshold=80
)

print(f"\nFuzzy matching results:")
for query, match in matches.items():
    print(f"  '{query}' -> '{match}'")

Taxonomic Name Resolution

TaxonomicResolver Class (New in v1.3.0)

The TaxonomicResolver enables validation and standardization of plant species names against authoritative online taxonomic databases.

from VegZ import TaxonomicResolver, resolve_species_names

# Initialize with default source (World Flora Online)
resolver = TaxonomicResolver()

# Or specify sources
# NOTE: each assignment below rebinds `resolver`, so when this snippet is run
# as written only the last configuration is used by the calls that follow.
resolver = TaxonomicResolver(sources='gbif')  # Single source
resolver = TaxonomicResolver(sources=['wfo', 'powo', 'gbif'], use_fallback=True)  # Multiple with fallback

# Resolve a list of species names
species_list = ['Quercus robur', 'Pinus sylvestris', 'Betula pendula']
results = resolver.resolve_names(species_list)

# Show a subset of the result columns. `results` is indexed with a list of
# column names, so it is presumably a pandas DataFrame -- confirm against the API.
print("Resolution results:")
print(results[['original_name', 'accepted_name', 'match_score', 'family', 'source']])

Supported Taxonomic Databases

File-Based Resolution

# Resolve species names directly from a file. The second call rebinds
# `results`, so the exports below use the output of the 'data.xlsx' run.
results = resolver.resolve_from_file('species_list.csv')
results = resolver.resolve_from_file('data.xlsx', species_column='ScientificName')

# Export the same results to various formats, one target file per call
for export_path in ('resolved_names.csv', 'resolved_names.xlsx', 'resolved_names.json'):
    resolver.export_results(results, export_path)

DataFrame Integration

import pandas as pd

# Read the survey table that holds the species names to resolve
df = pd.read_csv('vegetation_survey.csv')

# Resolve the names in the 'species' column and update the DataFrame
updated_df = resolver.resolve_dataframe(
    df,
    species_column='species',
    update_names=True,           # Replace with accepted names
    add_taxonomy_columns=True,   # Add family, genus, match_score columns
    min_score_threshold=70,      # Only update if score >= 70
)

# Original names are preserved in 'species_original' column; print them next
# to the updated names and the added taxonomy columns.
comparison_columns = ['species_original', 'species', 'taxon_family', 'taxon_match_score']
print(updated_df[comparison_columns])

Summary and Statistics

# Get resolution summary for a previously computed `results` object
resolver.print_summary(results)

# Example output (illustrative figures for a 10-name run):
# ============================================================
# TAXONOMIC RESOLUTION SUMMARY
# ============================================================
# Total names processed:     10
# Successfully resolved:     9 (90.0%)
# Unresolved:                1
# ------------------------------------------------------------
# High confidence (>=90):    7
# Medium confidence (70-89): 2
# Low confidence (<70):      0
# Average match score:       92.5
# ------------------------------------------------------------
# Sources used:              GBIF
# Unique families found:     5
# ============================================================

Convenience Functions

from VegZ.data_management import resolve_species_names, resolve_species_from_file, update_species_in_dataframe

# Quick resolution
results = resolve_species_names(['Quercus robur', 'Pinus sylvestris'], sources='gbif')

# Resolve from file with automatic export
# NOTE: this rebinds `results`, replacing the output of the call above.
results = resolve_species_from_file('species.csv', output_file='resolved.xlsx')

# Update DataFrame in one line
# `df` is not created in this snippet; it must already exist (e.g. loaded
# with pd.read_csv). No species column is passed here, so the function
# presumably falls back to a default column name -- confirm against the API.
df_updated = update_species_in_dataframe(df, sources='gbif', min_score=70)

Output Columns

The resolution results include:

| Column | Description |
|---|---|
| original_name | Input species name |
| accepted_name | Resolved accepted name |
| accepted_author | Author citation |
| match_score | Confidence (0-100) |
| match_type | exact, fuzzy, candidate, synonym |
| taxonomic_status | accepted, synonym, unresolved |
| synonyms | Known synonyms from database |
| family | Taxonomic family |
| genus | Genus name |
| source | Database used (WFO, POWO, etc.) |
| source_id | Record ID in source database |
| source_url | Direct link to record |

Best Practices

Complete Analysis Workflow

def complete_vegetation_analysis(data, env_data=None):
    """
    Run the end-to-end vegetation workflow and collect every result.

    The pipeline computes data-quality statistics, diversity indices, PCA and
    NMDS ordinations, a k-means clustering whose k comes from an elbow
    analysis, an optional PERMANOVA on the cluster labels, and an HTML report.

    Parameters
    ----------
    data : site-by-species abundance table (rows are sites, columns species).
    env_data : optional. When it is not None the PERMANOVA step is run; the
        object itself is not read anywhere in this function.

    Returns
    -------
    dict with the keys 'data_quality', 'diversity', 'ordination',
    'clustering', 'statistics' (only when env_data is given) and
    'report_file'.
    """
    run_statistics = env_data is not None
    results = {}

    # Step 1: Data Quality Check
    print("Step 1: Data quality assessment...")
    n_sites, n_species = data.shape[0], data.shape[1]
    nonzero_cells = (data > 0).sum().sum()
    quality_stats = {
        'n_sites': n_sites,
        'n_species': n_species,
        # Share of matrix cells that hold a positive abundance
        'completeness': nonzero_cells / (n_sites * n_species),
        'zero_sites': (data.sum(axis=1) == 0).sum(),
        'zero_species': (data.sum(axis=0) == 0).sum()
    }
    results['data_quality'] = quality_stats
    print(f"  Data quality: {quality_stats['completeness']:.2%} completeness")

    # Step 2: Diversity Analysis
    print("Step 2: Diversity analysis...")
    veg = VegZ()
    veg.data = veg.species_matrix = data

    index_names = ('shannon', 'simpson', 'richness')
    diversity = veg.calculate_diversity(list(index_names))
    results['diversity'] = diversity
    print(f"  Diversity calculated for {diversity.shape[0]} sites")

    # Step 3: Multivariate Analysis
    print("Step 3: Multivariate analysis...")
    pca_results = veg.pca_analysis(transform='hellinger')
    nmds_results = veg.nmds_analysis(distance_metric='bray_curtis')
    results['ordination'] = {'pca': pca_results, 'nmds': nmds_results}
    leading_variance = pca_results['explained_variance_ratio'][:2]
    print(f"  PCA explained variance: {leading_variance}")
    print(f"  NMDS stress: {nmds_results['stress']:.3f}")

    # Step 4: Clustering Analysis
    print("Step 4: Clustering analysis...")
    clustering = VegetationClustering()

    # Use the consensus recommendation of the elbow analysis, or 3 clusters
    # when no consensus is reported.
    elbow_results = clustering.comprehensive_elbow_analysis(data, plot_results=False)
    recommendations = elbow_results.get('recommendations', {})
    optimal_k = recommendations.get('consensus', 3)

    kmeans_results = clustering.kmeans_clustering(data, n_clusters=optimal_k)
    results['clustering'] = kmeans_results
    print(f"  Clustering completed with k={optimal_k}")
    silhouette = kmeans_results['silhouette_score']
    print(f"  Silhouette score: {silhouette:.3f}")

    # Step 5: Statistical Tests (only when environmental data was supplied)
    if run_statistics:
        print("Step 5: Statistical analysis...")
        stats = EcologicalStatistics()
        groups = kmeans_results['cluster_labels']

        # Square Bray-Curtis distance matrix between sites for PERMANOVA
        distance_matrix = squareform(pdist(data, metric='braycurtis'))

        permanova = stats.permanova(
            distance_matrix=distance_matrix,
            groups=groups,
            permutations=199  # Reduced for speed
        )
        results['statistics'] = permanova
        print(f"  PERMANOVA: F={permanova['f_statistic']:.3f}, p={permanova['p_value']:.3f}")

    # Step 6: Generate Report
    print("Step 6: Generating report...")
    report_gen = ReportGenerator()

    summary_results = {
        'data_summary': quality_stats,
        # One 'mean_<index>' entry per diversity index, in the order computed
        'diversity_summary': {
            f'mean_{index}': diversity[index].mean() for index in index_names
        },
        'clustering_summary': {
            'method': 'K-means',
            'n_clusters': optimal_k,
            'silhouette_score': silhouette
        }
    }
    if run_statistics:
        summary_results['statistics_summary'] = {
            'permanova_f': permanova['f_statistic'],
            'permanova_p': permanova['p_value']
        }

    report_content = report_gen.generate_analysis_report(
        results=summary_results,
        output_format='html'
    )
    output_file = report_gen.save_report(
        report_content=report_content,
        filename='complete_analysis_report.html',
        format='html'
    )

    results['report_file'] = output_file
    print(f"  Report saved to: {output_file}")

    return results

# Run complete analysis
# `data` and `env_data` are not defined in this snippet; both must already
# exist before these lines are executed.
print("=== COMPLETE VEGETATION ANALYSIS WORKFLOW ===")
complete_results = complete_vegetation_analysis(data, env_data)

print("\n=== ANALYSIS COMPLETE ===")
print(f"Results include: {list(complete_results.keys())}")
print("All analyses completed successfully with verified syntax!")

Performance Tips

# For large datasets
def handle_large_dataset(data, chunk_size=1000):
    """Compute diversity for ``data``, in chunks when it exceeds ``chunk_size`` rows.

    Parameters
    ----------
    data : table of sites (one row per site); sliced with ``.iloc`` when
        it is processed in chunks.
    chunk_size : int, maximum number of rows handed to
        ``quick_diversity_analysis`` in one call. Must be positive.

    Returns
    -------
    The result of ``quick_diversity_analysis`` for the whole table when it
    has at most ``chunk_size`` rows, otherwise the row-wise concatenation
    of the per-chunk results.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    # Fail fast with a clear message: a zero step makes range() raise a
    # cryptic error, and a negative one yields no chunks so pd.concat() fails.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    n_rows = len(data)
    if n_rows <= chunk_size:
        return quick_diversity_analysis(data)

    print(f"Processing large dataset ({n_rows} sites) in chunks...")
    # Process diversity in chunks for memory efficiency
    diversity_chunks = [
        quick_diversity_analysis(data.iloc[start:start + chunk_size])
        for start in range(0, n_rows, chunk_size)
    ]

    # Combine results
    return pd.concat(diversity_chunks, axis=0)

# Memory-efficient analysis
# chunk_size=25 triggers the chunked path whenever `data` has more than 25 rows.
large_diversity = handle_large_dataset(data, chunk_size=25)
print(f"Large dataset analysis completed: {large_diversity.shape}")

Summary

This manual provides complete, verified examples for all major VegZ functionality:

Key Classes and Their Main Methods:

All examples in this manual use the correct syntax and will run successfully with the VegZ package.