1
0
mirror of https://github.com/j178/prek.git synced 2026-04-25 02:11:36 +02:00

Simplify archive extraction implementation (#193)

This commit is contained in:
Jo
2025-02-18 21:05:08 +08:00
committed by GitHub
parent b6bb23f74c
commit 22419d1f3e
+47 -151
View File
@@ -23,10 +23,12 @@ use std::collections::HashSet;
use std::ffi::OsString;
use std::fmt::{Display, Formatter};
use std::path::{Component, Path, PathBuf};
use std::pin::Pin;
use futures::StreamExt;
use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt};
use async_compression::tokio::bufread::{GzipDecoder, XzDecoder};
use async_zip::base::read::stream::ZipFileReader;
use tokio::io::{AsyncRead, BufReader};
use tokio_tar::ArchiveBuilder;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::warn;
#[derive(Debug, thiserror::Error)]
@@ -128,10 +130,7 @@ pub fn strip_component(source: impl AsRef<Path>) -> Result<PathBuf, Error> {
/// This is useful for unzipping files as they're being downloaded. If the archive
/// is already fully on disk, consider using `unzip_archive`, which can use multiple
/// threads to work faster in that case.
pub async fn unzip<R: tokio::io::AsyncRead + Unpin>(
reader: R,
target: impl AsRef<Path>,
) -> Result<(), Error> {
pub async fn unzip<R: AsyncRead + Unpin>(reader: R, target: impl AsRef<Path>) -> Result<(), Error> {
/// Ensure the file path is safe to use as a [`Path`].
///
/// See: <https://docs.rs/zip/latest/zip/read/struct.ZipFile.html#method.enclosed_name>
@@ -153,8 +152,8 @@ pub async fn unzip<R: tokio::io::AsyncRead + Unpin>(
}
let target = target.as_ref();
let mut reader = futures::io::BufReader::with_capacity(DEFAULT_BUF_SIZE, reader.compat());
let mut zip = async_zip::base::read::stream::ZipFileReader::new(&mut reader);
let mut reader = BufReader::with_capacity(DEFAULT_BUF_SIZE, reader);
let mut zip = ZipFileReader::with_tokio(&mut reader);
let mut directories = HashSet::new();
@@ -208,10 +207,13 @@ pub async fn unzip<R: tokio::io::AsyncRead + Unpin>(
// which indicates the first entry in the central directory. So we continue reading from there.
#[cfg(unix)]
{
use async_zip::base::read::cd::CentralDirectoryReader;
use std::fs::Permissions;
use std::os::unix::fs::PermissionsExt;
use tokio_util::compat::TokioAsyncReadCompatExt;
let mut directory = async_zip::base::read::cd::CentralDirectoryReader::new(&mut reader);
let mut reader = reader.compat();
let mut directory = CentralDirectoryReader::new(&mut reader);
while let Some(entry) = directory.next().await? {
if entry.dir()? {
continue;
@@ -221,181 +223,75 @@ pub async fn unzip<R: tokio::io::AsyncRead + Unpin>(
continue;
};
// The executable bit is the only permission we preserve, otherwise we use the OS defaults.
// https://github.com/pypa/pip/blob/3898741e29b7279e7bffe044ecfbe20f6a438b1e/src/pip/_internal/utils/unpacking.py#L88-L100
let has_any_executable_bit = mode & 0o111;
if has_any_executable_bit != 0 {
// Construct the (expected) path to the file on-disk.
let path = entry.filename().as_str()?;
let Some(path) = enclosed_name(path) else {
continue;
};
let path = target.join(path);
let permissions = fs_err::tokio::metadata(&path).await?.permissions();
if permissions.mode() & 0o111 != 0o111 {
fs_err::tokio::set_permissions(
&path,
Permissions::from_mode(permissions.mode() | 0o111),
)
.await?;
}
}
// Construct the (expected) path to the file on-disk.
let path = entry.filename().as_str()?;
let Some(path) = enclosed_name(path) else {
continue;
};
let path = target.join(path);
fs_err::tokio::set_permissions(&path, Permissions::from_mode(mode)).await?;
}
}
Ok(())
}
/// Compute the on-disk location a tar entry named `entry` maps to beneath `dst`.
///
/// Returns `None` when the entry must not be unpacked:
/// - the entry path contains a `..` component,
/// - the entry path contributes no normal components, so the result would
///   equal `dst` itself, or
/// - the resulting path has no parent.
///
/// See: <https://github.com/vorot93/tokio-tar/blob/87338a76092330bc6fe60de95d83eae5597332e1/src/entry.rs#L418>
#[cfg_attr(not(unix), allow(dead_code))]
fn unpacked_at(dst: &Path, entry: &Path) -> Option<PathBuf> {
    let mut resolved = PathBuf::from(dst);

    for component in entry.components() {
        match component {
            // Only ordinary name components extend the destination path.
            Component::Normal(name) => resolved.push(name),
            // A `..` anywhere in the name rejects the whole entry, guarding
            // against directory traversal (see, e.g., CVE-2001-1267,
            // CVE-2002-0399, CVE-2005-1918, CVE-2007-4131).
            Component::ParentDir => return None,
            // Prefixes, a leading root, and `.` are treated as empty
            // components and contribute nothing.
            Component::Prefix(..) | Component::RootDir | Component::CurDir => {}
        }
    }

    // Nothing was appended: the entry name was effectively empty.
    if resolved.as_path() == dst {
        return None;
    }

    // Reject results that have no parent (i.e. outside of the FS root).
    resolved.parent()?;

    Some(resolved)
}
/// Extract every entry of `archive` beneath `dst`.
///
/// Each entry is unpacked with `unpack_in(dst)`. On Unix, regular files and
/// hard links whose header mode carries any executable bit additionally get
/// the executable bits (`0o111`) set on the unpacked file.
///
/// When compiled for Windows, symlink entries are skipped with a warning
/// instead of being unpacked.
///
/// # Errors
///
/// Propagates any I/O error from reading the archive, unpacking an entry, or
/// reading/updating file permissions.
async fn untar_in(
    mut archive: tokio_tar::Archive<&mut (dyn tokio::io::AsyncRead + Unpin)>,
    dst: &Path,
) -> std::io::Result<()> {
    let mut entries = archive.entries()?;
    let mut stream = Pin::new(&mut entries);

    while let Some(next) = stream.next().await {
        let mut entry = next?;

        // Symlinks aren't supported on Windows, so they are skipped there.
        // pip recursively copies the symlink target instead.
        if cfg!(windows) && entry.header().entry_type().is_symlink() {
            warn!(
                "Skipping symlink in tar archive: {}",
                entry.path()?.display()
            );
            continue;
        }

        // Write the entry into the destination directory.
        entry.unpack_in(dst).await?;

        // Carry the executable bit over to the unpacked file.
        #[cfg(unix)]
        {
            use std::fs::Permissions;
            use std::os::unix::fs::PermissionsExt;

            // Only regular files and hard links are considered.
            let kind = entry.header().entry_type();
            if !(kind.is_file() || kind.is_hard_link()) {
                continue;
            }

            // Nothing to do unless the archived mode has an executable bit.
            let mode = entry.header().mode()?;
            if mode & 0o111 == 0 {
                continue;
            }

            // Work out where the entry landed; skip it if that can't be determined.
            let Some(path) = unpacked_at(dst, &entry.path()?) else {
                continue;
            };

            // Only touch the file when some executable bit is still missing.
            let current = fs_err::tokio::metadata(&path).await?.permissions().mode();
            if current & 0o111 != 0o111 {
                fs_err::tokio::set_permissions(&path, Permissions::from_mode(current | 0o111))
                    .await?;
            }
        }
    }

    Ok(())
}
/// Unpack a `.tar.gz` archive into the target directory, without requiring `Seek`.
///
/// This is useful for unpacking files as they're being downloaded.
pub async fn untar_gz<R: tokio::io::AsyncRead + Unpin>(
pub async fn untar_gz<R: AsyncRead + Unpin>(
reader: R,
target: impl AsRef<Path>,
) -> Result<(), Error> {
let reader = tokio::io::BufReader::with_capacity(DEFAULT_BUF_SIZE, reader);
let mut decompressed_bytes = async_compression::tokio::bufread::GzipDecoder::new(reader);
let reader = BufReader::with_capacity(DEFAULT_BUF_SIZE, reader);
let reader = GzipDecoder::new(reader);
let archive = tokio_tar::ArchiveBuilder::new(
&mut decompressed_bytes as &mut (dyn tokio::io::AsyncRead + Unpin),
)
.set_preserve_mtime(false)
.build();
Ok(untar_in(archive, target.as_ref()).await?)
let mut archive = ArchiveBuilder::new(reader)
.set_preserve_mtime(true)
.set_preserve_permissions(true)
.build();
archive.unpack(target.as_ref()).await?;
Ok(())
}
/// Unpack a `.tar.xz` archive into the target directory, without requiring `Seek`.
///
/// This is useful for unpacking files as they're being downloaded.
pub async fn untar_xz<R: tokio::io::AsyncRead + Unpin>(
pub async fn untar_xz<R: AsyncRead + Unpin>(
reader: R,
target: impl AsRef<Path>,
) -> Result<(), Error> {
let reader = tokio::io::BufReader::with_capacity(DEFAULT_BUF_SIZE, reader);
let mut decompressed_bytes = async_compression::tokio::bufread::XzDecoder::new(reader);
let reader = BufReader::with_capacity(DEFAULT_BUF_SIZE, reader);
let reader = XzDecoder::new(reader);
let archive = tokio_tar::ArchiveBuilder::new(
&mut decompressed_bytes as &mut (dyn tokio::io::AsyncRead + Unpin),
)
.set_preserve_mtime(false)
.build();
untar_in(archive, target.as_ref()).await?;
let mut archive = ArchiveBuilder::new(reader)
.set_preserve_mtime(true)
.set_preserve_permissions(true)
.build();
archive.unpack(target.as_ref()).await?;
Ok(())
}
/// Unpack a `.tar` archive into the target directory, without requiring `Seek`.
///
/// This is useful for unpacking files as they're being downloaded.
pub async fn untar<R: tokio::io::AsyncRead + Unpin>(
reader: R,
target: impl AsRef<Path>,
) -> Result<(), Error> {
let mut reader = tokio::io::BufReader::with_capacity(DEFAULT_BUF_SIZE, reader);
pub async fn untar<R: AsyncRead + Unpin>(reader: R, target: impl AsRef<Path>) -> Result<(), Error> {
let reader = BufReader::with_capacity(DEFAULT_BUF_SIZE, reader);
let archive =
tokio_tar::ArchiveBuilder::new(&mut reader as &mut (dyn tokio::io::AsyncRead + Unpin))
.set_preserve_mtime(false)
.build();
untar_in(archive, target.as_ref()).await?;
let mut archive = ArchiveBuilder::new(reader)
.set_preserve_mtime(true)
.set_preserve_permissions(true)
.build();
archive.unpack(target.as_ref()).await?;
Ok(())
}
/// Unpack a `.zip`, `.tar.gz`, `.tar.bz2`, `.tar.zst`, or `.tar.xz` archive into the target directory,
/// without requiring `Seek`.
pub async fn unpack<R: tokio::io::AsyncRead + Unpin>(
pub async fn unpack<R: AsyncRead + Unpin>(
reader: R,
ext: ArchiveExtension,
target: impl AsRef<Path>,