1
0
mirror of https://github.com/MontFerret/ferret.git synced 2025-03-17 21:18:37 +02:00

setting Input charset (#609)

* feat(driver): add Charset param

* feat(driver): add convert to UTF8 by input charset
This commit is contained in:
Roman 2021-04-09 01:35:29 +03:00 committed by GitHub
parent 6645ff1521
commit 08e9054ba4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 95 additions and 3 deletions

2
go.mod
View File

@ -25,6 +25,6 @@ require (
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd
golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a
golang.org/x/sys v0.0.0-20200420163511-1957bb5e6d1f // indirect
golang.org/x/text v0.3.2 // indirect
golang.org/x/text v0.3.2
gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect
)

View File

@ -4,9 +4,12 @@ import (
"bytes"
"context"
"github.com/gobwas/glob"
"io"
"net/http"
"net/url"
"golang.org/x/net/html/charset"
"github.com/PuerkitoBio/goquery"
"github.com/pkg/errors"
"github.com/sethgrid/pester"
@ -130,8 +133,15 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
return nil, errors.New(resp.Status)
}
doc, err := goquery.NewDocumentFromReader(resp.Body)
body := io.Reader(resp.Body)
if params.Charset != "" {
body, err = drv.convertToUTF8(body, params.Charset)
if err != nil {
return nil, errors.Wrapf(err, "failed convert to UTF-8 a document %s", params.URL)
}
}
doc, err := goquery.NewDocumentFromReader(body)
if err != nil {
return nil, errors.Wrapf(err, "failed to parse a document %s", params.URL)
}
@ -209,3 +219,12 @@ func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers
return allowed
}
// convertToUTF8 wraps reader so that its content, encoded in srcCharset, is
// decoded to UTF-8 as it is read.
//
// srcCharset is a charset label such as "windows-1251"; it is resolved with
// charset.Lookup. An error is returned when the label is not recognized.
//
// The label is resolved explicitly instead of being handed to
// charset.NewReader: that function expects a Content-Type header value as its
// second argument, so a bare label carries no "charset" parameter and the
// encoding would be sniffed from the content rather than taken from the caller.
func (drv *Driver) convertToUTF8(reader io.Reader, srcCharset string) (io.Reader, error) {
	enc, _ := charset.Lookup(srcCharset)
	if enc == nil {
		return nil, errors.Errorf("unsupported charset %q", srcCharset)
	}

	return enc.NewDecoder().Reader(reader), nil
}

View File

@ -1,13 +1,18 @@
package http
import (
"bytes"
"crypto/tls"
"github.com/MontFerret/ferret/pkg/drivers"
"io"
"io/ioutil"
"net/http"
"reflect"
"testing"
"unsafe"
"golang.org/x/text/encoding/charmap"
"github.com/smartystreets/goconvey/convey"
)
@ -106,5 +111,61 @@ func Test_newHTTPClient(t *testing.T) {
convey.So(hc, convey.ShouldNotBeNil)
})
}
// TestDriver_convertToUTF8 checks that convertToUTF8 decodes content encoded
// in the given source charset into the expected UTF-8 text.
func TestDriver_convertToUTF8(t *testing.T) {
	type args struct {
		inputData  string
		srcCharset string
	}
	tests := []struct {
		name     string
		args     args
		expected string
		wantErr  bool
	}{
		{
			name: "should convert to expected state",
			args: args{
				inputData:  `<!DOCTYPE html><html><head><meta charset="windows-1251"/></head><body>феррет</body></html>`,
				srcCharset: "windows-1251",
			},
			wantErr:  false,
			expected: `<!DOCTYPE html><html><head><meta charset="windows-1251"/></head><body>феррет</body></html>`,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			drv := &Driver{}

			convey.Convey(tt.name, t, func() {
				// Fixtures are written as UTF-8 literals; re-encode them to
				// Windows-1251 so the driver receives non-UTF-8 bytes.
				// NOTE: the fixture encoding is fixed to Windows-1251 and is
				// independent of tt.args.srcCharset.
				encoded, err := charmap.Windows1251.NewEncoder().Bytes([]byte(tt.args.inputData))
				convey.So(err, convey.ShouldBeNil)

				var src io.Reader = bytes.NewReader(encoded)

				gotData, err := drv.convertToUTF8(src, tt.args.srcCharset)
				if tt.wantErr {
					convey.So(err, convey.ShouldNotBeNil)
					return
				}
				convey.So(err, convey.ShouldBeNil)

				outData, err := ioutil.ReadAll(gotData)
				convey.So(err, convey.ShouldBeNil)
				convey.So(string(outData), convey.ShouldEqual, tt.expected)
			})
		})
	}
}

View File

@ -31,6 +31,7 @@ type (
Cookies *HTTPCookies
Headers *HTTPHeaders
Viewport *Viewport
Charset string
Ignore *Ignore
}

View File

@ -50,6 +50,7 @@ type PageLoadParams struct {
// @param {Float} [params.viewport.scaleFactor] - Viewport scale factor.
// @param {Boolean} [params.viewport.mobile] - Value that indicates whether to emulate mobile device.
// @param {Boolean} [params.viewport.landscape] - Value that indicates whether to render a page in landscape position.
// @param {String} [params.charset] - (only HTTPDriver) Source charset of the content, used to convert it to UTF-8.
// @return {HTMLPage} - Loaded HTML page.
func Open(ctx context.Context, args ...core.Value) (core.Value, error) {
err := core.ValidateArgs(args, 1, 2)
@ -215,6 +216,16 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
res.Ignore = ignore
}
charset, exists := obj.Get(values.NewString("charset"))
if exists {
if err := core.ValidateType(charset, types.String); err != nil {
return res, err
}
res.Charset = charset.String()
}
case types.String:
res.Driver = arg.(values.String).String()
case types.Boolean: