diff --git a/go.mod b/go.mod index af4b21e0..1a0b8a6a 100644 --- a/go.mod +++ b/go.mod @@ -25,6 +25,6 @@ require ( golang.org/x/net v0.0.0-20200421231249-e086a090c8fd golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a golang.org/x/sys v0.0.0-20200420163511-1957bb5e6d1f // indirect - golang.org/x/text v0.3.2 // indirect + golang.org/x/text v0.3.2 gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect ) diff --git a/pkg/drivers/http/driver.go b/pkg/drivers/http/driver.go index eaf7bc1f..5e8886a4 100644 --- a/pkg/drivers/http/driver.go +++ b/pkg/drivers/http/driver.go @@ -4,9 +4,12 @@ import ( "bytes" "context" "github.com/gobwas/glob" + "io" "net/http" "net/url" + "golang.org/x/net/html/charset" + "github.com/PuerkitoBio/goquery" "github.com/pkg/errors" "github.com/sethgrid/pester" @@ -130,8 +133,15 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM return nil, errors.New(resp.Status) } - doc, err := goquery.NewDocumentFromReader(resp.Body) + body := io.Reader(resp.Body) + if params.Charset != "" { + body, err = drv.convertToUTF8(body, params.Charset) + if err != nil { + return nil, errors.Wrapf(err, "failed convert to UTF-8 a document %s", params.URL) + } + } + doc, err := goquery.NewDocumentFromReader(body) if err != nil { return nil, errors.Wrapf(err, "failed to parse a document %s", params.URL) } @@ -209,3 +219,12 @@ func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers return allowed } + +func (drv *Driver) convertToUTF8(reader io.Reader, srcCharset string) (data io.Reader, err error) { + data, err = charset.NewReader(reader, srcCharset) + if err != nil { + return nil, err + } + + return +} diff --git a/pkg/drivers/http/driver_test.go b/pkg/drivers/http/driver_test.go index 7d9e2aa8..ea55c96f 100644 --- a/pkg/drivers/http/driver_test.go +++ b/pkg/drivers/http/driver_test.go @@ -1,13 +1,18 @@ package http import ( + "bytes" "crypto/tls" "github.com/MontFerret/ferret/pkg/drivers" + "io" + "io/ioutil" 
"net/http" "reflect" "testing" "unsafe" + "golang.org/x/text/encoding/charmap" + "github.com/smartystreets/goconvey/convey" ) @@ -106,5 +111,61 @@ func Test_newHTTPClient(t *testing.T) { convey.So(hc, convey.ShouldNotBeNil) }) - +} + +func TestDriver_convertToUTF8(t *testing.T) { + type args struct { + inputData string + srcCharset string + } + tests := []struct { + name string + args args + wantData io.Reader + expected string + wantErr bool + }{ + { + name: "should convert to expected state", + args: args{ + inputData: `феррет`, + srcCharset: "windows-1251", + }, + wantErr: false, + expected: `феррет`, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + drv := &Driver{} + + convey.Convey(tt.name, t, func() { + + data, err := ioutil.ReadAll(bytes.NewBufferString(tt.args.inputData)) + if err != nil { + panic(err) + } + + encodedData := make([]byte, len(data)*2) + + dec := charmap.Windows1251.NewEncoder() + nDst, _, err := dec.Transform(encodedData, data, false) + if err != nil { + panic(err) + } + + encodedData = encodedData[:nDst] + + gotData, err := drv.convertToUTF8(bytes.NewReader(encodedData), tt.args.srcCharset) + convey.So(err, convey.ShouldBeNil) + + outData, err := ioutil.ReadAll(gotData) + convey.So(err, convey.ShouldBeNil) + + convey.So(string(outData), convey.ShouldEqual, tt.expected) + + }) + + }) + } } diff --git a/pkg/drivers/params.go b/pkg/drivers/params.go index f85033df..61b7f244 100644 --- a/pkg/drivers/params.go +++ b/pkg/drivers/params.go @@ -31,6 +31,7 @@ type ( Cookies *HTTPCookies Headers *HTTPHeaders Viewport *Viewport + Charset string Ignore *Ignore } diff --git a/pkg/stdlib/html/document.go b/pkg/stdlib/html/document.go index a5c72fcd..524089ff 100644 --- a/pkg/stdlib/html/document.go +++ b/pkg/stdlib/html/document.go @@ -50,6 +50,7 @@ type PageLoadParams struct { // @param {Float} [params.viewport.scaleFactor] - Viewport scale factor. 
// @param {Boolean} [params.viewport.mobile] - Value that indicates whether to emulate mobile device. // @param {Boolean} [params.viewport.landscape] - Value that indicates whether to render a page in landscape position. +// @param {String} [params.charset] - (HTTPDriver only) Source charset of the page content, used to convert it to UTF-8. // @return {HTMLPage} - Loaded HTML page. func Open(ctx context.Context, args ...core.Value) (core.Value, error) { err := core.ValidateArgs(args, 1, 2) @@ -215,6 +216,16 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error res.Ignore = ignore } + + charset, exists := obj.Get(values.NewString("charset")) + + if exists { + if err := core.ValidateType(charset, types.String); err != nil { + return res, err + } + + res.Charset = charset.String() + } case types.String: res.Driver = arg.(values.String).String() case types.Boolean: