From a727a5a0847c4e9e6919d9ef75c7738ea06e3469 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 26 Dec 2024 16:05:11 +0100 Subject: [PATCH 1/7] Update pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7ab9cd2c17669..52216049359cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] aws = ['s3fs>=2022.11.0'] gcp = ['gcsfs>=2022.11.0'] +hf = ['huggingface-hub>=0.27.0'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.0', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.5'] parquet = ['pyarrow>=10.0.1'] feather = ['pyarrow>=10.0.1'] From ca6de8c06e7e885f518ccc348c63eadab06cdcb7 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 26 Dec 2024 16:09:12 +0100 Subject: [PATCH 2/7] Update install.rst --- doc/source/getting_started/install.rst | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index bda959f380e8a..c3b1596d91301 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -324,16 +324,17 @@ Dependency Minimum Version pip ex Access data in the cloud ^^^^^^^^^^^^^^^^^^^^^^^^ -Installable with ``pip install "pandas[fss, aws, gcp]"`` - -============================================ ================== =============== ========================================================== -Dependency Minimum Version pip extra Notes -============================================ ================== =============== ========================================================== -`fsspec `__ 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required - dependency of s3fs, gcsfs). 
-`gcsfs `__ 2022.11.0 gcp Google Cloud Storage access -`s3fs `__ 2022.11.0 aws Amazon S3 access -============================================ ================== =============== ========================================================== +Installable with ``pip install "pandas[fss, aws, gcp, hf]"`` + +===================================================================== ================== =============== ========================================================== +Dependency Minimum Version pip extra Notes +===================================================================== ================== =============== ========================================================== +`fsspec `__ 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required + dependency of s3fs, gcsfs). +`gcsfs `__ 2022.11.0 gcp Google Cloud Storage access +`s3fs `__ 2022.11.0 aws Amazon S3 access +`huggingface-hub `__ 0.27.0 hf Hugging Face Hub access +===================================================================== ================== =============== ========================================================== Clipboard ^^^^^^^^^ From f1b7cc5e0034dc94fd5bda95fbf0e3bdf026e35a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 26 Dec 2024 16:11:28 +0100 Subject: [PATCH 3/7] Update io.rst --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7c165c87adb46..ec46ebab262d6 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1565,7 +1565,7 @@ of header key value mappings to the ``storage_options`` keyword argument as show All URLs which are not local files or HTTP(s) are handled by `fsspec`_, if installed, and its various filesystem implementations -(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...). +(including Amazon S3, Google Cloud, Hugging Face, SSH, FTP, webHDFS...). 
Some of these implementations will require additional packages to be installed, for example S3 URLs require the `s3fs From a40a166c9ef4f9a1eb332b67fe6aa5cb089d6811 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 30 Dec 2024 11:26:58 +0100 Subject: [PATCH 4/7] remove pip extra --- doc/source/getting_started/install.rst | 21 ++++++++++----------- pyproject.toml | 1 - 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index c3b1596d91301..bda959f380e8a 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -324,17 +324,16 @@ Dependency Minimum Version pip ex Access data in the cloud ^^^^^^^^^^^^^^^^^^^^^^^^ -Installable with ``pip install "pandas[fss, aws, gcp, hf]"`` - -===================================================================== ================== =============== ========================================================== -Dependency Minimum Version pip extra Notes -===================================================================== ================== =============== ========================================================== -`fsspec `__ 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required - dependency of s3fs, gcsfs). 
-`gcsfs `__ 2022.11.0 gcp Google Cloud Storage access -`s3fs `__ 2022.11.0 aws Amazon S3 access -`huggingface-hub `__ 0.27.0 hf Hugging Face Hub access -===================================================================== ================== =============== ========================================================== +Installable with ``pip install "pandas[fss, aws, gcp]"`` + +============================================ ================== =============== ========================================================== +Dependency Minimum Version pip extra Notes +============================================ ================== =============== ========================================================== +`fsspec `__ 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required + dependency of s3fs, gcsfs). +`gcsfs `__ 2022.11.0 gcp Google Cloud Storage access +`s3fs `__ 2022.11.0 aws Amazon S3 access +============================================ ================== =============== ========================================================== Clipboard ^^^^^^^^^ diff --git a/pyproject.toml b/pyproject.toml index 52216049359cc..7ab9cd2c17669 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,6 @@ computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] aws = ['s3fs>=2022.11.0'] gcp = ['gcsfs>=2022.11.0'] -hf = ['huggingface-hub>=0.27.0'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.0', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.5'] parquet = ['pyarrow>=10.0.1'] feather = ['pyarrow>=10.0.1'] From c6064b4d2f4e80bdc8932b271c9c0ba4b391ab81 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 30 Dec 2024 12:37:52 +0100 Subject: [PATCH 5/7] Update ecosystem.md --- web/pandas/community/ecosystem.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 6c69ff7602491..14b4be13cf15b 100644 --- 
a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -468,6 +468,29 @@ df.dtypes ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/). +### [Hugging Face](https://huggingface.co/datasets) + +The Hugging Face Dataset Hub provides a large collection of ready-to-use datasets for machine learning shared by the community. The platform offers a user-friendly interface to explore, discover and visualize datasets, and provides tools to easily load and work with these datasets in Python thanks to the [huggingface_hub](https://github.com/huggingface/huggingface_hub) library. + +You can access datasets on Hugging Face using `hf://` paths in pandas, in the form `hf://datasets/username/dataset_name/...`. + +For example, here is how to load the [stanfordnlp/imdb dataset](https://huggingface.co/datasets/stanfordnlp/imdb): + +```python +import pandas as pd + +# Load the IMDB dataset +df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet") +``` + +Tip: on a dataset page, click on "Use this dataset" to get the code to load it in pandas. 
+ +To save a dataset on Hugging Face, you need to [create a public or private dataset](https://huggingface.co/new-dataset) and [log in](https://huggingface.co/docs/huggingface_hub/quick-start#login-command), and then you can use `df.to_csv/to_json/to_parquet`: + +```python +# Save the dataset to my Hugging Face account +df.to_parquet("hf://datasets/username/dataset_name/train.parquet") +``` ## Out-of-core From 1aa31f1f4af5c90cb233d25c4bd7a820eab22688 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 30 Dec 2024 12:40:57 +0100 Subject: [PATCH 6/7] link to docs --- web/pandas/community/ecosystem.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 14b4be13cf15b..dc7b9bc947214 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -492,6 +492,8 @@ To save a dataset on Hugging Face you need to [create a public or private datase df.to_parquet("hf://datasets/username/dataset_name/train.parquet") ``` +You can find more information about the Hugging Face Dataset Hub in the [documentation](https://huggingface.co/docs/hub/en/datasets). 
+ ## Out-of-core ### [Bodo](https://bodo.ai/) From 318d378320105bd5de20935d6015743226ecc1ad Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Tue, 31 Dec 2024 17:37:30 +0100 Subject: [PATCH 7/7] Revert change in io.rst --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ec46ebab262d6..7c165c87adb46 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1565,7 +1565,7 @@ of header key value mappings to the ``storage_options`` keyword argument as show All URLs which are not local files or HTTP(s) are handled by `fsspec`_, if installed, and its various filesystem implementations -(including Amazon S3, Google Cloud, Hugging Face, SSH, FTP, webHDFS...). +(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...). Some of these implementations will require additional packages to be installed, for example S3 URLs require the `s3fs