I have an issue with Dask. I have checked the csv file and everything is OK, I do not upload it because it is confidential. But maybe you can try your own CSV and see that you get the same error.
My code is bellow:
from dask.distributed import Client
client = Client(n_workers=4)
client
import dask.dataframe as dd
df = dd.read_csv('merged_data.csv')
X=df[['Mp10','Mp10_cal','Mp2_5','Mp2_5_cal','Humedad','Temperatura']]
y = df['Sector']
from dask_ml.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, shuffle=False)
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search
param_grid = {
'bootstrap': [True],
'max_depth': [80, 90, 100, 110],
'max_features': [2, 3],
'min_samples_leaf': [3, 4, 5],
'min_samples_split': [8, 10, 12],
'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
cv = 5, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train).compute()```
Bellow the error:
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_17712/1827769193.py in <module>
----> 1 grid_search.fit(X_train, y_train).compute()
C:\WORKSPACE\DataLab\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
889 return results
890
--> 891 self._run_search(evaluate_candidates)
892
893 # multimetric is determined here because in the case of a callable
C:\WORKSPACE\DataLab\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1390 def _run_search(self, evaluate_candidates):
1391 """Search all candidates in param_grid"""
-> 1392 evaluate_candidates(ParameterGrid(self.param_grid))
1393
1394
C:\WORKSPACE\DataLab\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
836 )
837
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
C:\WORKSPACE\DataLab\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1054
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
C:\WORKSPACE\DataLab\lib\site-packages\joblib\parallel.py in retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
C:\WORKSPACE\DataLab\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\_base.py in result(self, timeout)
443 raise CancelledError()
444 elif self._state == FINISHED:
--> 445 return self.__get_result()
446 else:
447 raise TimeoutError()
~\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\_base.py in __get_result(self)
388 if self._exception:
389 try:
--> 390 raise self._exception
391 finally:
392 # Break a reference cycle with the exception in self._exception
AttributeError: 'DataFrame' object has no attribute 'take'
dask_ml.datasets.make_classificationto create a synthetic dataset for this question.