From 22419d1f3ed66048dba6861452ea5cdc8ae4bb9f Mon Sep 17 00:00:00 2001 From: Jo <10510431+j178@users.noreply.github.com> Date: Tue, 18 Feb 2025 21:05:08 +0800 Subject: [PATCH] Simplify archive extraction implementation (#193) --- src/archive.rs | 198 ++++++++++++------------------------------------- 1 file changed, 47 insertions(+), 151 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 9a22a4de..4e87e26c 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -23,10 +23,12 @@ use std::collections::HashSet; use std::ffi::OsString; use std::fmt::{Display, Formatter}; use std::path::{Component, Path, PathBuf}; -use std::pin::Pin; -use futures::StreamExt; -use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}; +use async_compression::tokio::bufread::{GzipDecoder, XzDecoder}; +use async_zip::base::read::stream::ZipFileReader; +use tokio::io::{AsyncRead, BufReader}; +use tokio_tar::ArchiveBuilder; +use tokio_util::compat::FuturesAsyncReadCompatExt; use tracing::warn; #[derive(Debug, thiserror::Error)] @@ -128,10 +130,7 @@ pub fn strip_component(source: impl AsRef) -> Result { /// This is useful for unzipping files as they're being downloaded. If the archive /// is already fully on disk, consider using `unzip_archive`, which can use multiple /// threads to work faster in that case. -pub async fn unzip( - reader: R, - target: impl AsRef, -) -> Result<(), Error> { +pub async fn unzip(reader: R, target: impl AsRef) -> Result<(), Error> { /// Ensure the file path is safe to use as a [`Path`]. /// /// See: @@ -153,8 +152,8 @@ pub async fn unzip( } let target = target.as_ref(); - let mut reader = futures::io::BufReader::with_capacity(DEFAULT_BUF_SIZE, reader.compat()); - let mut zip = async_zip::base::read::stream::ZipFileReader::new(&mut reader); + let mut reader = BufReader::with_capacity(DEFAULT_BUF_SIZE, reader); + let mut zip = ZipFileReader::with_tokio(&mut reader); let mut directories = HashSet::new(); @@ -208,10 +207,13 @@ pub async fn unzip( // which indicates the first entry in the central directory. So we continue reading from there. #[cfg(unix)] { + use async_zip::base::read::cd::CentralDirectoryReader; use std::fs::Permissions; use std::os::unix::fs::PermissionsExt; + use tokio_util::compat::TokioAsyncReadCompatExt; - let mut directory = async_zip::base::read::cd::CentralDirectoryReader::new(&mut reader); + let mut reader = reader.compat(); + let mut directory = CentralDirectoryReader::new(&mut reader); while let Some(entry) = directory.next().await? { if entry.dir()? { continue; @@ -221,181 +223,75 @@ pub async fn unzip( continue; }; - // The executable bit is the only permission we preserve, otherwise we use the OS defaults. - // https://github.com/pypa/pip/blob/3898741e29b7279e7bffe044ecfbe20f6a438b1e/src/pip/_internal/utils/unpacking.py#L88-L100 - let has_any_executable_bit = mode & 0o111; - if has_any_executable_bit != 0 { - // Construct the (expected) path to the file on-disk. - let path = entry.filename().as_str()?; - let Some(path) = enclosed_name(path) else { - continue; - }; - let path = target.join(path); - - let permissions = fs_err::tokio::metadata(&path).await?.permissions(); - if permissions.mode() & 0o111 != 0o111 { - fs_err::tokio::set_permissions( - &path, - Permissions::from_mode(permissions.mode() | 0o111), - ) - .await?; - } - } + // Construct the (expected) path to the file on-disk. + let path = entry.filename().as_str()?; + let Some(path) = enclosed_name(path) else { + continue; + }; + let path = target.join(path); + fs_err::tokio::set_permissions(&path, Permissions::from_mode(mode)).await?; } } Ok(()) } -/// Determine the path at which the given tar entry will be unpacked, when unpacking into `dst`. -/// -/// See: -#[cfg_attr(not(unix), allow(dead_code))] -fn unpacked_at(dst: &Path, entry: &Path) -> Option { - let mut file_dst = dst.to_path_buf(); - { - for part in entry.components() { - match part { - // Leading '/' characters, root paths, and '.' - // components are just ignored and treated as "empty - // components" - Component::Prefix(..) | Component::RootDir | Component::CurDir => { - continue; - } - - // If any part of the filename is '..', then skip over - // unpacking the file to prevent directory traversal - // security issues. See, e.g.: CVE-2001-1267, - // CVE-2002-0399, CVE-2005-1918, CVE-2007-4131 - Component::ParentDir => return None, - - Component::Normal(part) => file_dst.push(part), - } - } - } - - // Skip cases where only slashes or '.' parts were seen, because - // this is effectively an empty filename. - if *dst == *file_dst { - return None; - } - - // Skip entries without a parent (i.e. outside of FS root) - file_dst.parent()?; - - Some(file_dst) -} - -/// Unpack the given tar archive into the destination directory. -/// -/// This is equivalent to `archive.unpack_in(dst)`, but it also preserves the executable bit. -async fn untar_in( - mut archive: tokio_tar::Archive<&mut (dyn tokio::io::AsyncRead + Unpin)>, - dst: &Path, -) -> std::io::Result<()> { - let mut entries = archive.entries()?; - let mut pinned = Pin::new(&mut entries); - while let Some(entry) = pinned.next().await { - // Unpack the file into the destination directory. - let mut file = entry?; - - // On Windows, skip symlink entries, as they're not supported. pip recursively copies the - // symlink target instead. - if cfg!(windows) && file.header().entry_type().is_symlink() { - warn!( - "Skipping symlink in tar archive: {}", - file.path()?.display() - ); - continue; - } - - file.unpack_in(dst).await?; - - // Preserve the executable bit. - #[cfg(unix)] - { - use std::fs::Permissions; - use std::os::unix::fs::PermissionsExt; - - let entry_type = file.header().entry_type(); - if entry_type.is_file() || entry_type.is_hard_link() { - let mode = file.header().mode()?; - let has_any_executable_bit = mode & 0o111; - if has_any_executable_bit != 0 { - if let Some(path) = unpacked_at(dst, &file.path()?) { - let permissions = fs_err::tokio::metadata(&path).await?.permissions(); - if permissions.mode() & 0o111 != 0o111 { - fs_err::tokio::set_permissions( - &path, - Permissions::from_mode(permissions.mode() | 0o111), - ) - .await?; - } - } - } - } - } - } - Ok(()) -} - /// Unpack a `.tar.gz` archive into the target directory, without requiring `Seek`. /// /// This is useful for unpacking files as they're being downloaded. -pub async fn untar_gz( +pub async fn untar_gz( reader: R, target: impl AsRef, ) -> Result<(), Error> { - let reader = tokio::io::BufReader::with_capacity(DEFAULT_BUF_SIZE, reader); - let mut decompressed_bytes = async_compression::tokio::bufread::GzipDecoder::new(reader); + let reader = BufReader::with_capacity(DEFAULT_BUF_SIZE, reader); + let reader = GzipDecoder::new(reader); - let archive = tokio_tar::ArchiveBuilder::new( - &mut decompressed_bytes as &mut (dyn tokio::io::AsyncRead + Unpin), - ) - .set_preserve_mtime(false) - .build(); - Ok(untar_in(archive, target.as_ref()).await?) + let mut archive = ArchiveBuilder::new(reader) + .set_preserve_mtime(true) + .set_preserve_permissions(true) + .build(); + + archive.unpack(target.as_ref()).await?; + Ok(()) } /// Unpack a `.tar.xz` archive into the target directory, without requiring `Seek`. /// /// This is useful for unpacking files as they're being downloaded. -pub async fn untar_xz( +pub async fn untar_xz( reader: R, target: impl AsRef, ) -> Result<(), Error> { - let reader = tokio::io::BufReader::with_capacity(DEFAULT_BUF_SIZE, reader); - let mut decompressed_bytes = async_compression::tokio::bufread::XzDecoder::new(reader); + let reader = BufReader::with_capacity(DEFAULT_BUF_SIZE, reader); + let reader = XzDecoder::new(reader); - let archive = tokio_tar::ArchiveBuilder::new( - &mut decompressed_bytes as &mut (dyn tokio::io::AsyncRead + Unpin), - ) - .set_preserve_mtime(false) - .build(); - untar_in(archive, target.as_ref()).await?; + let mut archive = ArchiveBuilder::new(reader) + .set_preserve_mtime(true) + .set_preserve_permissions(true) + .build(); + + archive.unpack(target.as_ref()).await?; Ok(()) } /// Unpack a `.tar` archive into the target directory, without requiring `Seek`. /// /// This is useful for unpacking files as they're being downloaded. -pub async fn untar( - reader: R, - target: impl AsRef, -) -> Result<(), Error> { - let mut reader = tokio::io::BufReader::with_capacity(DEFAULT_BUF_SIZE, reader); +pub async fn untar(reader: R, target: impl AsRef) -> Result<(), Error> { + let reader = BufReader::with_capacity(DEFAULT_BUF_SIZE, reader); - let archive = - tokio_tar::ArchiveBuilder::new(&mut reader as &mut (dyn tokio::io::AsyncRead + Unpin)) - .set_preserve_mtime(false) - .build(); - untar_in(archive, target.as_ref()).await?; + let mut archive = ArchiveBuilder::new(reader) + .set_preserve_mtime(true) + .set_preserve_permissions(true) + .build(); + + archive.unpack(target.as_ref()).await?; Ok(()) } /// Unpack a `.zip`, `.tar.gz`, `.tar.bz2`, `.tar.zst`, or `.tar.xz` archive into the target directory, /// without requiring `Seek`. -pub async fn unpack( +pub async fn unpack( reader: R, ext: ArchiveExtension, target: impl AsRef,