added warc parser
This commit is contained in:
parent
db81015e0b
commit
f45e4a6034
8 changed files with 954 additions and 0 deletions
38
go.mod
38
go.mod
|
|
@ -1,3 +1,41 @@
|
||||||
module github.com/joe/everytab
|
module github.com/joe/everytab
|
||||||
|
|
||||||
go 1.25.9
|
go 1.25.9
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/aws/aws-sdk-go-v2 v1.41.7 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/config v1.32.17 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/credentials v1.19.16 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 // indirect
|
||||||
|
github.com/aws/smithy-go v1.25.1 // indirect
|
||||||
|
github.com/bits-and-blooms/bitset v1.24.0 // indirect
|
||||||
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
|
github.com/jackc/pgpassfile v1.0.0 // indirect
|
||||||
|
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
|
||||||
|
github.com/jackc/pgx/v5 v5.9.2 // indirect
|
||||||
|
github.com/jackc/puddle/v2 v2.2.2 // indirect
|
||||||
|
github.com/klauspost/compress v1.18.0 // indirect
|
||||||
|
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
||||||
|
github.com/nlnwa/gowarc/v3 v3.1.0 // indirect
|
||||||
|
github.com/nlnwa/whatwg-url v0.6.2 // indirect
|
||||||
|
github.com/rivo/uniseg v0.4.7 // indirect
|
||||||
|
github.com/schollz/progressbar/v3 v3.19.0 // indirect
|
||||||
|
golang.org/x/net v0.54.0 // indirect
|
||||||
|
golang.org/x/sync v0.20.0 // indirect
|
||||||
|
golang.org/x/sys v0.44.0 // indirect
|
||||||
|
golang.org/x/term v0.43.0 // indirect
|
||||||
|
golang.org/x/text v0.37.0 // indirect
|
||||||
|
)
|
||||||
|
|
|
||||||
143
go.sum
Normal file
143
go.sum
Normal file
|
|
@ -0,0 +1,143 @@
|
||||||
|
github.com/aws/aws-sdk-go-v2 v1.41.7 h1:DWpAJt66FmnnaRIOT/8ASTucrvuDPZASqhhLey6tLY8=
|
||||||
|
github.com/aws/aws-sdk-go-v2 v1.41.7/go.mod h1:4LAfZOPHNVNQEckOACQx60Y8pSRjIkNZQz1w92xpMJc=
|
||||||
|
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 h1:gx1AwW1Iyk9Z9dD9F4akX5gnN3QZwUB20GGKH/I+Rho=
|
||||||
|
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10/go.mod h1:qqY157uZoqm5OXq/amuaBJyC9hgBCBQnsaWnPe905GY=
|
||||||
|
github.com/aws/aws-sdk-go-v2/config v1.32.17 h1:FpL4/758/diKwqbytU0prpuiu60fgXKUWCpDJtApclU=
|
||||||
|
github.com/aws/aws-sdk-go-v2/config v1.32.17/go.mod h1:OXqUMzgXytfoF9JaKkhrOYsyh72t9G+MJH8mMRaexOE=
|
||||||
|
github.com/aws/aws-sdk-go-v2/credentials v1.19.16 h1:r3RJBuU7X9ibt8RHbMjWE6y60QbKBiII6wSrXnapxSU=
|
||||||
|
github.com/aws/aws-sdk-go-v2/credentials v1.19.16/go.mod h1:6cx7zqDENJDbBIIWX6P8s0h6hqHC8Avbjh9Dseo27ug=
|
||||||
|
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 h1:UuSfcORqNSz/ey3VPRS8TcVH2Ikf0/sC+Hdj400QI6U=
|
||||||
|
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23/go.mod h1:+G/OSGiOFnSOkYloKj/9M35s74LgVAdJBSD5lsFfqKg=
|
||||||
|
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 h1:GpT/TrnBYuE5gan2cZbTtvP+JlHsutdmlV2YfEyNde0=
|
||||||
|
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23/go.mod h1:xYWD6BS9ywC5bS3sz9Xh04whO/hzK2plt2Zkyrp4JuA=
|
||||||
|
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 h1:bpd8vxhlQi2r1hiueOw02f/duEPTMK59Q4QMAoTTtTo=
|
||||||
|
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23/go.mod h1:15DfR2nw+CRHIk0tqNyifu3G1YdAOy68RftkhMDDwYk=
|
||||||
|
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 h1:OQqn11BtaYv1WLUowvcA30MpzIu8Ti4pcLPIIyoKZrA=
|
||||||
|
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24/go.mod h1:X5ZJyfwVrWA96GzPmUCWFQaEARPR7gCrpq2E92PJwAE=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 h1:FLudkZLt5ci0ozzgkVo8BJGwvqNaZbTWb3UcucAateA=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9/go.mod h1:w7wZ/s9qK7c8g4al+UyoF1Sp/Z45UwMGcqIzLWVQHWk=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 h1:ieLCO1JxUWuxTZ1cRd0GAaeX7O6cIxnwk7tc1LsQhC4=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15/go.mod h1:e3IzZvQ3kAWNykvE0Tr0RDZCMFInMvhku3qNpcIQXhM=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 h1:pbrxO/kuIwgEsOPLkaHu0O+m4fNgLU8B3vxQ+72jTPw=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23/go.mod h1:/CMNUqoj46HpS3MNRDEDIwcgEnrtZlKRaHNaHxIFpNA=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 h1:03xatSQO4+AM1lTAbnRg5OK528EUg744nW7F73U8DKw=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23/go.mod h1:M8l3mwgx5ToK7wot2sBBce/ojzgnPzZXUV445gTSyE8=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 h1:etqBTKY581iwLL/H/S2sVgk3C9lAsTJFeXWFDsDcWOU=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0/go.mod h1:L2dcoOgS2VSgbPLvpak2NyUPsO1TBN7M45Z4H7DlRc4=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 h1:TdJ+HdzOBhU8+iVAOGUTU63VXopcumCOF1paFulHWZc=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/signin v1.0.11/go.mod h1:R82ZRExE/nheo0N+T8zHPcLRTcH8MGsnR3BiVGX0TwI=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 h1:7byT8HUWrgoRp6sXjxtZwgOKfhss5fW6SkLBtqzgRoE=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/sso v1.30.17/go.mod h1:xNWknVi4Ezm1vg1QsB/5EWpAJURq22uqd38U8qKvOJc=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 h1:+1Kl1zx6bWi4X7cKi3VYh29h8BvsCoHQEQ6ST9X8w7w=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21/go.mod h1:4vIRDq+CJB2xFAXZ+YgGUTiEft7oAQlhIs71xcSeuVg=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 h1:F/M5Y9I3nwr2IEpshZgh1GeHpOItExNM9L1euNuh/fk=
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/sts v1.42.1/go.mod h1:mTNxImtovCOEEuD65mKW7DCsL+2gjEH+RPEAexAzAio=
|
||||||
|
github.com/aws/smithy-go v1.25.1 h1:J8ERsGSU7d+aCmdQur5Txg6bVoYelvQJgtZehD12GkI=
|
||||||
|
github.com/aws/smithy-go v1.25.1/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc=
|
||||||
|
github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||||
|
github.com/bits-and-blooms/bitset v1.24.0 h1:H4x4TuulnokZKvHLfzVRTHJfFfnHEeSYJizujEZvmAM=
|
||||||
|
github.com/bits-and-blooms/bitset v1.24.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||||
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
|
||||||
|
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
|
||||||
|
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
|
||||||
|
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
|
||||||
|
github.com/jackc/pgx/v5 v5.9.2 h1:3ZhOzMWnR4yJ+RW1XImIPsD1aNSz4T4fyP7zlQb56hw=
|
||||||
|
github.com/jackc/pgx/v5 v5.9.2/go.mod h1:mal1tBGAFfLHvZzaYh77YS/eC6IX9OWbRV1QIIM0Jn4=
|
||||||
|
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
|
||||||
|
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
|
||||||
|
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
|
||||||
|
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
|
||||||
|
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
|
||||||
|
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
|
||||||
|
github.com/nlnwa/gowarc/v3 v3.1.0 h1:XKfqqE0yoQRxlT4/6IRJRP6yzCXjKm7xIrVuEABL9Xw=
|
||||||
|
github.com/nlnwa/gowarc/v3 v3.1.0/go.mod h1:nJ3ob3Zx4lOvme06pNPzcTsLHzLAhxGRMPuiOJP/0LE=
|
||||||
|
github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q=
|
||||||
|
github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
|
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||||
|
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||||
|
github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc=
|
||||||
|
github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
|
||||||
|
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||||
|
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||||
|
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||||
|
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||||
|
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||||
|
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||||
|
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
|
||||||
|
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
|
||||||
|
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
|
||||||
|
golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
|
||||||
|
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||||
|
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||||
|
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||||
|
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||||
|
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||||
|
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||||
|
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||||
|
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||||
|
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||||
|
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||||
|
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
|
||||||
|
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
||||||
|
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||||
|
golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
|
||||||
|
golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w=
|
||||||
|
golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ=
|
||||||
|
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
|
||||||
|
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||||
|
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||||
|
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||||
|
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||||
|
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||||
|
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||||
|
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
|
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
|
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
|
golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
|
golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
|
||||||
|
golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
|
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
|
||||||
|
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||||
|
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||||
|
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||||
|
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
|
||||||
|
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
|
||||||
|
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
|
||||||
|
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
|
||||||
|
golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek=
|
||||||
|
golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4=
|
||||||
|
golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk=
|
||||||
|
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||||
|
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||||
|
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||||
|
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||||
|
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||||
|
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||||
|
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||||
|
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||||
|
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
|
||||||
|
golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc=
|
||||||
|
golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38=
|
||||||
|
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||||
|
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||||
|
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||||
|
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||||
|
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
|
||||||
|
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
|
||||||
|
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
117
pipeline/02_warc_parse/db.go
Normal file
117
pipeline/02_warc_parse/db.go
Normal file
|
|
@ -0,0 +1,117 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/jackc/pgx/v5/pgxpool"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Host mirrors the columns that fetchBatch selects from the hosts table.
type Host struct {
	// ID is the hosts.id column; fetchBatch orders and pages by it.
	ID int64
	// Hostname is the hosts.hostname column.
	Hostname string
	// Protocol is the hosts.protocol column; writeResult uses it as the URL
	// scheme when building the /favicon.ico URL.
	Protocol string
	// WarcFilename is the hosts.warc_filename column.
	WarcFilename string
	// WarcRecordOffset is the hosts.warc_record_offset column.
	WarcRecordOffset int64
	// WarcRecordLength is the hosts.warc_record_length column.
	WarcRecordLength int
}
|
||||||
|
|
||||||
|
// ProcessResult holds everything extracted from one host's WARC record.
|
||||||
|
type ProcessResult struct {
|
||||||
|
Title string
|
||||||
|
IframeAllowed bool
|
||||||
|
Icons []Icon
|
||||||
|
Err error
|
||||||
|
FetchErr bool // true if error was during fetch (vs parse)
|
||||||
|
}
|
||||||
|
|
||||||
|
// WriteErrors counts the database write failures hit while persisting one
// host's result.
type WriteErrors struct {
	// HostUpdate counts failed UPDATEs of the hosts row.
	HostUpdate int
	// IconInsert counts failed INSERTs into the icons table.
	IconInsert int
}
|
||||||
|
|
||||||
|
// fetchBatch gets the next batch of unparsed hosts after lastID.
|
||||||
|
func fetchBatch(ctx context.Context, pool *pgxpool.Pool, lastID int64, limit int) ([]Host, error) {
|
||||||
|
rows, err := pool.Query(ctx,
|
||||||
|
`SELECT id, hostname, protocol, warc_filename, warc_record_offset, warc_record_length
|
||||||
|
FROM hosts
|
||||||
|
WHERE parsed = FALSE AND id > $1
|
||||||
|
ORDER BY id
|
||||||
|
LIMIT $2`,
|
||||||
|
lastID, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
var hosts []Host
|
||||||
|
for rows.Next() {
|
||||||
|
var h Host
|
||||||
|
err := rows.Scan(&h.ID, &h.Hostname, &h.Protocol, &h.WarcFilename, &h.WarcRecordOffset, &h.WarcRecordLength)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
hosts = append(hosts, h)
|
||||||
|
}
|
||||||
|
return hosts, rows.Err()
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeResult writes parsed results back to the database.
|
||||||
|
// Returns counts of DB write errors encountered.
|
||||||
|
func writeResult(ctx context.Context, pool *pgxpool.Pool, host Host, result ProcessResult, logWriter *LogWriter) WriteErrors {
|
||||||
|
var errs WriteErrors
|
||||||
|
|
||||||
|
// Update hosts table
|
||||||
|
_, err := pool.Exec(ctx,
|
||||||
|
`UPDATE hosts SET html_title = $1, iframe_allowed = $2, parsed = TRUE WHERE id = $3`,
|
||||||
|
nilIfEmpty(result.Title), result.IframeAllowed, host.ID)
|
||||||
|
if err != nil {
|
||||||
|
errs.HostUpdate++
|
||||||
|
logLine := fmt.Sprintf("DB_ERROR: %s hosts_update: %v", host.Hostname, err)
|
||||||
|
fmt.Println(logLine)
|
||||||
|
if logWriter != nil {
|
||||||
|
logWriter.Write(logLine, true)
|
||||||
|
}
|
||||||
|
return errs
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert /favicon.ico entry
|
||||||
|
faviconURL := fmt.Sprintf("%s://%s/favicon.ico", host.Protocol, host.Hostname)
|
||||||
|
_, err = pool.Exec(ctx,
|
||||||
|
`INSERT INTO icons (host_id, url, source) VALUES ($1, $2, 'favicon_ico')`,
|
||||||
|
host.ID, faviconURL)
|
||||||
|
if err != nil {
|
||||||
|
errs.IconInsert++
|
||||||
|
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err)
|
||||||
|
fmt.Println(logLine)
|
||||||
|
if logWriter != nil {
|
||||||
|
logWriter.Write(logLine, true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert link rel="icon" entries
|
||||||
|
for _, icon := range result.Icons {
|
||||||
|
_, err = pool.Exec(ctx,
|
||||||
|
`INSERT INTO icons (host_id, url, source, rel_type, rel_sizes) VALUES ($1, $2, $3, $4, $5)`,
|
||||||
|
host.ID, icon.URL, icon.Source, nilIfEmpty(icon.RelType), nilIfEmpty(icon.RelSizes))
|
||||||
|
if err != nil {
|
||||||
|
errs.IconInsert++
|
||||||
|
logLine := fmt.Sprintf("DB_ERROR: %s icon_insert: %v", host.Hostname, err)
|
||||||
|
fmt.Println(logLine)
|
||||||
|
if logWriter != nil {
|
||||||
|
logWriter.Write(logLine, true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return errs
|
||||||
|
}
|
||||||
|
|
||||||
|
// nilIfEmpty returns nil for the empty string and a pointer to a copy of s
// otherwise. writeResult passes the result as a query argument for the title
// and the icon rel_type / rel_sizes values.
func nilIfEmpty(s string) *string {
	if len(s) > 0 {
		return &s
	}
	return nil
}
|
||||||
94
pipeline/02_warc_parse/log.go
Normal file
94
pipeline/02_warc_parse/log.go
Normal file
|
|
@ -0,0 +1,94 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// LogWriter appends log lines to a file and may be shared between goroutines.
type LogWriter struct {
	// file is the destination opened by NewLogWriter.
	file *os.File
	// mu serialises writes to file.
	mu sync.Mutex
	// errorsOnly makes Write drop every line that is not flagged as an error.
	errorsOnly bool
}
|
||||||
|
|
||||||
|
func NewLogWriter(path string, errorsOnly bool) (*LogWriter, error) {
|
||||||
|
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &LogWriter{file: f, errorsOnly: errorsOnly}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (lw *LogWriter) Write(line string, isError bool) {
|
||||||
|
if lw.errorsOnly && !isError {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
lw.mu.Lock()
|
||||||
|
defer lw.mu.Unlock()
|
||||||
|
fmt.Fprintln(lw.file, line)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (lw *LogWriter) Close() error {
|
||||||
|
return lw.file.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatLogLine creates a concise one-line log for a processed host.
|
||||||
|
func formatLogLine(host Host, result ProcessResult) string {
|
||||||
|
title := result.Title
|
||||||
|
if len(title) > 20 {
|
||||||
|
title = title[:20] + "..."
|
||||||
|
}
|
||||||
|
|
||||||
|
if result.Err != nil {
|
||||||
|
errType := "parse"
|
||||||
|
if result.FetchErr {
|
||||||
|
errType = "fetch"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("parsed: %s err:%s %v", host.Hostname, errType, result.Err)
|
||||||
|
}
|
||||||
|
|
||||||
|
iconCount := len(result.Icons) + 1 // +1 for /favicon.ico
|
||||||
|
iframe := "iframe:ok"
|
||||||
|
if !result.IframeAllowed {
|
||||||
|
iframe = "iframe:no"
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf("parsed: %s \"%s\" icons:%d %s", host.Hostname, title, iconCount, iframe)
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeStats writes the stage stats to a JSON file.
|
||||||
|
func writeStats(stats *Stats, cfg Config) {
|
||||||
|
finishedAt := time.Now()
|
||||||
|
duration := finishedAt.Sub(stats.StartedAt)
|
||||||
|
|
||||||
|
data := map[string]interface{}{
|
||||||
|
"started_at": stats.StartedAt.Format(time.RFC3339),
|
||||||
|
"finished_at": finishedAt.Format(time.RFC3339),
|
||||||
|
"duration_seconds": int(duration.Seconds()),
|
||||||
|
"processed": stats.Processed.Load(),
|
||||||
|
"titles_found": stats.TitlesFound.Load(),
|
||||||
|
"icons_found": stats.IconsFound.Load(),
|
||||||
|
"iframe_blocked": stats.IframeBlocked.Load(),
|
||||||
|
"fetch_errors": stats.FetchErrors.Load(),
|
||||||
|
"parse_errors": stats.ParseErrors.Load(),
|
||||||
|
"db_errors": stats.DBErrors.Load(),
|
||||||
|
"panics": stats.Panics.Load(),
|
||||||
|
}
|
||||||
|
|
||||||
|
os.MkdirAll("stats", 0755)
|
||||||
|
f, err := os.Create("stats/02_warc_parse.json")
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("Failed to write stats: %v\n", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
enc := json.NewEncoder(f)
|
||||||
|
enc.SetIndent("", " ")
|
||||||
|
enc.Encode(data)
|
||||||
|
fmt.Println("Stats written to stats/02_warc_parse.json")
|
||||||
|
}
|
||||||
207
pipeline/02_warc_parse/main.go
Normal file
207
pipeline/02_warc_parse/main.go
Normal file
|
|
@ -0,0 +1,207 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/jackc/pgx/v5/pgxpool"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Config holds the command-line options of the WARC parse stage.
type Config struct {
	// DBUrl is the Postgres connection string (--db, required).
	DBUrl string
	// BatchSize is the number of rows fetched per batch (--batch-size).
	BatchSize int
	// Concurrency is the number of concurrent goroutines (--concurrency).
	Concurrency int
	// Limit caps the number of rows processed; 0 means all (--limit).
	Limit int
	// DryRun prints results without writing to the database (--dry-run).
	DryRun bool
	// LogFile, when non-empty, receives a copy of the log lines (--log-file).
	LogFile string
	// LogErrors restricts the log file to error lines (--log-errors-only).
	LogErrors bool
}
|
||||||
|
|
||||||
|
// Stats collects the run counters. The counters are atomic because worker
// goroutines update them concurrently; a Stats value must not be copied once
// in use.
type Stats struct {
	// Processed counts every host handled, including panicked ones.
	Processed atomic.Int64
	// TitlesFound counts results that carried a non-empty title.
	TitlesFound atomic.Int64
	// IconsFound sums the icons found in the results.
	IconsFound atomic.Int64
	// IframeBlocked counts results whose IframeAllowed flag was false.
	IframeBlocked atomic.Int64
	// ParseErrors counts failed results not flagged as fetch errors.
	ParseErrors atomic.Int64
	// FetchErrors counts failed results flagged as fetch errors.
	FetchErrors atomic.Int64
	// DBErrors sums the write failures reported by writeResult.
	DBErrors atomic.Int64
	// Panics counts worker goroutines that recovered from a panic.
	Panics atomic.Int64
	// StartedAt is when the run began.
	StartedAt time.Time
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
cfg := Config{}
|
||||||
|
flag.StringVar(&cfg.DBUrl, "db", "", "Postgres connection string (required)")
|
||||||
|
flag.IntVar(&cfg.BatchSize, "batch-size", 500, "Rows to fetch per batch")
|
||||||
|
flag.IntVar(&cfg.Concurrency, "concurrency", 100, "Number of concurrent goroutines")
|
||||||
|
flag.IntVar(&cfg.Limit, "limit", 0, "Max rows to process (0 = all)")
|
||||||
|
flag.BoolVar(&cfg.DryRun, "dry-run", false, "Print results without writing to DB")
|
||||||
|
flag.StringVar(&cfg.LogFile, "log-file", "", "Mirror log lines to this file")
|
||||||
|
flag.BoolVar(&cfg.LogErrors, "log-errors-only", false, "Only write errors to log file")
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
if cfg.DBUrl == "" {
|
||||||
|
fmt.Println("Usage: warc_parse --db DATABASE_URL [OPTIONS]")
|
||||||
|
flag.PrintDefaults()
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// Init S3 client
|
||||||
|
if err := initS3(); err != nil {
|
||||||
|
log.Fatalf("Failed to init S3: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
pool, err := pgxpool.New(ctx, cfg.DBUrl)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Failed to connect to database: %v", err)
|
||||||
|
}
|
||||||
|
defer pool.Close()
|
||||||
|
|
||||||
|
// Get total count
|
||||||
|
var total int64
|
||||||
|
if cfg.Limit > 0 {
|
||||||
|
total = int64(cfg.Limit)
|
||||||
|
} else {
|
||||||
|
err = pool.QueryRow(ctx, "SELECT COUNT(*) FROM hosts WHERE parsed = FALSE").Scan(&total)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Failed to count unparsed hosts: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if total == 0 {
|
||||||
|
fmt.Println("No unparsed hosts found.")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("=== WARC Parser ===\n")
|
||||||
|
fmt.Printf("Unparsed hosts: %d\n", total)
|
||||||
|
fmt.Printf("Concurrency: %d\n", cfg.Concurrency)
|
||||||
|
fmt.Printf("Batch size: %d\n", cfg.BatchSize)
|
||||||
|
fmt.Printf("Dry run: %v\n\n", cfg.DryRun)
|
||||||
|
|
||||||
|
// Setup log file
|
||||||
|
var logWriter *LogWriter
|
||||||
|
if cfg.LogFile != "" {
|
||||||
|
logWriter, err = NewLogWriter(cfg.LogFile, cfg.LogErrors)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Failed to open log file: %v", err)
|
||||||
|
}
|
||||||
|
defer logWriter.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
stats := &Stats{StartedAt: time.Now()}
|
||||||
|
|
||||||
|
|
||||||
|
// Worker pool
|
||||||
|
sem := make(chan struct{}, cfg.Concurrency)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
// Process in batches
|
||||||
|
var lastID int64
|
||||||
|
processed := 0
|
||||||
|
|
||||||
|
for {
|
||||||
|
if cfg.Limit > 0 && processed >= cfg.Limit {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
batchLimit := cfg.BatchSize
|
||||||
|
if cfg.Limit > 0 && processed+batchLimit > cfg.Limit {
|
||||||
|
batchLimit = cfg.Limit - processed
|
||||||
|
}
|
||||||
|
|
||||||
|
hosts, err := fetchBatch(ctx, pool, lastID, batchLimit)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Failed to fetch batch: %v", err)
|
||||||
|
}
|
||||||
|
if len(hosts) == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
lastID = hosts[len(hosts)-1].ID
|
||||||
|
|
||||||
|
for i := range hosts {
|
||||||
|
host := hosts[i]
|
||||||
|
wg.Add(1)
|
||||||
|
sem <- struct{}{}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
defer func() { <-sem }()
|
||||||
|
|
||||||
|
// Recover from panics — log them, don't mark row as parsed
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
stats.Panics.Add(1)
|
||||||
|
stats.Processed.Add(1)
|
||||||
|
logLine := fmt.Sprintf("PANIC: %s %v", host.Hostname, r)
|
||||||
|
fmt.Println(logLine)
|
||||||
|
if logWriter != nil {
|
||||||
|
logWriter.Write(logLine, true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
result := processHost(host)
|
||||||
|
|
||||||
|
// Log line
|
||||||
|
logLine := formatLogLine(host, result)
|
||||||
|
fmt.Println(logLine)
|
||||||
|
if logWriter != nil {
|
||||||
|
logWriter.Write(logLine, result.Err != nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write to DB
|
||||||
|
if !cfg.DryRun && result.Err == nil {
|
||||||
|
errs := writeResult(ctx, pool, host, result, logWriter)
|
||||||
|
stats.DBErrors.Add(int64(errs.HostUpdate + errs.IconInsert))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update stats
|
||||||
|
stats.Processed.Add(1)
|
||||||
|
if result.Title != "" {
|
||||||
|
stats.TitlesFound.Add(1)
|
||||||
|
}
|
||||||
|
stats.IconsFound.Add(int64(len(result.Icons)))
|
||||||
|
if !result.IframeAllowed {
|
||||||
|
stats.IframeBlocked.Add(1)
|
||||||
|
}
|
||||||
|
if result.Err != nil {
|
||||||
|
if result.FetchErr {
|
||||||
|
stats.FetchErrors.Add(1)
|
||||||
|
} else {
|
||||||
|
stats.ParseErrors.Add(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
processed += len(hosts)
|
||||||
|
}
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
// Print summary
|
||||||
|
duration := time.Since(stats.StartedAt)
|
||||||
|
fmt.Printf("\n=== Summary ===\n")
|
||||||
|
fmt.Printf("Duration: %s\n", duration.Round(time.Second))
|
||||||
|
fmt.Printf("Processed: %d\n", stats.Processed.Load())
|
||||||
|
fmt.Printf("Titles found: %d\n", stats.TitlesFound.Load())
|
||||||
|
fmt.Printf("Icons found: %d\n", stats.IconsFound.Load())
|
||||||
|
fmt.Printf("Iframe blocked: %d\n", stats.IframeBlocked.Load())
|
||||||
|
fmt.Printf("Fetch errors: %d\n", stats.FetchErrors.Load())
|
||||||
|
fmt.Printf("Parse errors: %d\n", stats.ParseErrors.Load())
|
||||||
|
fmt.Printf("DB errors: %d\n", stats.DBErrors.Load())
|
||||||
|
fmt.Printf("Panics: %d\n", stats.Panics.Load())
|
||||||
|
|
||||||
|
// Write stats JSON
|
||||||
|
writeStats(stats, cfg)
|
||||||
|
}
|
||||||
175
pipeline/02_warc_parse/parser.go
Normal file
175
pipeline/02_warc_parse/parser.go
Normal file
|
|
@ -0,0 +1,175 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/net/html"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Icon describes one favicon candidate discovered for a host.
type Icon struct {
	// URL is the location of the icon.
	URL string
	// Source records how the icon was found: "favicon_ico" or "link_rel".
	Source string
	// RelType is the type attribute of the originating <link> (e.g., "image/png").
	RelType string
	// RelSizes is the sizes attribute of the originating <link> (e.g., "32x32").
	RelSizes string
}
|
||||||
|
|
||||||
|
// ParseResult holds extracted data from HTML parsing.
|
||||||
|
type ParseResult struct {
|
||||||
|
Title string
|
||||||
|
Icons []Icon
|
||||||
|
}
|
||||||
|
|
||||||
|
// ParseHTML extracts the title and link rel="icon" tags from HTML.
|
||||||
|
// Uses a lenient tokenizer approach that handles malformed HTML.
|
||||||
|
func ParseHTML(body []byte, protocol, hostname string) ParseResult {
|
||||||
|
result := ParseResult{}
|
||||||
|
tokenizer := html.NewTokenizer(strings.NewReader(string(body)))
|
||||||
|
|
||||||
|
inTitle := false
|
||||||
|
var titleBuilder strings.Builder
|
||||||
|
|
||||||
|
for {
|
||||||
|
tt := tokenizer.Next()
|
||||||
|
switch tt {
|
||||||
|
case html.ErrorToken:
|
||||||
|
// End of document or parse error — return what we have
|
||||||
|
result.Title = cleanTitle(titleBuilder.String())
|
||||||
|
return result
|
||||||
|
|
||||||
|
case html.StartTagToken, html.SelfClosingTagToken:
|
||||||
|
tn, hasAttr := tokenizer.TagName()
|
||||||
|
tagName := string(tn)
|
||||||
|
|
||||||
|
if tagName == "title" && tt == html.StartTagToken {
|
||||||
|
inTitle = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if tagName == "link" && hasAttr {
|
||||||
|
icon := parseLinkTag(tokenizer, protocol, hostname)
|
||||||
|
if icon != nil {
|
||||||
|
result.Icons = append(result.Icons, *icon)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop parsing after </head> to save time — icons and title are in <head>
|
||||||
|
if tagName == "body" {
|
||||||
|
result.Title = cleanTitle(titleBuilder.String())
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
case html.EndTagToken:
|
||||||
|
tn, _ := tokenizer.TagName()
|
||||||
|
if string(tn) == "title" {
|
||||||
|
inTitle = false
|
||||||
|
}
|
||||||
|
|
||||||
|
case html.TextToken:
|
||||||
|
if inTitle {
|
||||||
|
titleBuilder.Write(tokenizer.Text())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseLinkTag extracts icon info from a <link> tag if it's a favicon.
|
||||||
|
func parseLinkTag(tokenizer *html.Tokenizer, protocol, hostname string) *Icon {
|
||||||
|
var rel, href, typ, sizes string
|
||||||
|
|
||||||
|
for {
|
||||||
|
key, val, more := tokenizer.TagAttr()
|
||||||
|
k := string(key)
|
||||||
|
v := string(val)
|
||||||
|
|
||||||
|
switch k {
|
||||||
|
case "rel":
|
||||||
|
rel = strings.ToLower(v)
|
||||||
|
case "href":
|
||||||
|
href = v
|
||||||
|
case "type":
|
||||||
|
typ = strings.ToLower(v)
|
||||||
|
case "sizes":
|
||||||
|
sizes = strings.ToLower(v)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !more {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only interested in icon links
|
||||||
|
if !strings.Contains(rel, "icon") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if href == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve relative URLs
|
||||||
|
resolvedURL := resolveURL(href, protocol, hostname)
|
||||||
|
if resolvedURL == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return &Icon{
|
||||||
|
URL: resolvedURL,
|
||||||
|
Source: "link_rel",
|
||||||
|
RelType: typ,
|
||||||
|
RelSizes: sizes,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// resolveURL turns an icon href into an absolute URL for the given host.
//
// The href is trimmed, then handled by form:
//   - empty or "data:" hrefs yield "";
//   - "http://" and "https://" hrefs are returned unchanged;
//   - protocol-relative hrefs ("//host/path") get protocol prepended;
//   - root-relative hrefs ("/path") are appended to protocol://hostname;
//   - anything else is resolved as a reference against protocol://hostname/.
//
// References that carry a scheme other than http or https (for example
// "javascript:", "mailto:" or an upper-case "DATA:") are not fetchable icons
// and yield "". An href that cannot be parsed also yields "".
func resolveURL(href, protocol, hostname string) string {
	href = strings.TrimSpace(href)
	origin := protocol + "://" + hostname

	switch {
	case href == "", strings.HasPrefix(href, "data:"):
		return ""
	case strings.HasPrefix(href, "http://"), strings.HasPrefix(href, "https://"):
		return href
	case strings.HasPrefix(href, "//"):
		return protocol + ":" + href
	case strings.HasPrefix(href, "/"):
		return origin + href
	}

	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}
	// url.Parse lower-cases the scheme, so this also catches "DATA:" and lets
	// "HTTP://..." through. Without the check, ResolveReference would hand
	// back any absolute reference verbatim, whatever its scheme.
	if ref.Scheme != "" && ref.Scheme != "http" && ref.Scheme != "https" {
		return ""
	}

	base, err := url.Parse(origin + "/")
	if err != nil {
		return ""
	}
	return base.ResolveReference(ref).String()
}
|
||||||
|
|
||||||
|
// cleanTitle normalizes a raw <title> string: leading and trailing whitespace
// is removed, every internal run of whitespace collapses to a single space,
// and the result is limited to 512 bytes.
//
// Truncation never splits a multi-byte UTF-8 sequence: when the 512-byte cut
// would land inside a rune, the whole rune is dropped, so the result can be
// up to three bytes shorter than the limit.
func cleanTitle(s string) string {
	const maxTitleBytes = 512

	// Fields discards leading/trailing whitespace and splits on any run of it.
	s = strings.Join(strings.Fields(s), " ")
	if len(s) <= maxTitleBytes {
		return s
	}

	cut := maxTitleBytes
	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes. If the byte
	// just past the cut is one, the rune before it is incomplete, so back up
	// to its first byte. A rune has at most three continuation bytes, which
	// bounds the walk even on malformed input.
	for back := 0; back < 3 && cut > 0 && s[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return s[:cut]
}
|
||||||
54
pipeline/02_warc_parse/process.go
Normal file
54
pipeline/02_warc_parse/process.go
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"io"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/net/html/charset"
|
||||||
|
"golang.org/x/text/transform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// processHost fetches and parses a single host's WARC record.
|
||||||
|
func processHost(host Host) ProcessResult {
|
||||||
|
warcResult, err := FetchAndParseWARC(host.WarcFilename, host.WarcRecordOffset, int64(host.WarcRecordLength))
|
||||||
|
if err != nil {
|
||||||
|
return ProcessResult{Err: err, FetchErr: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check iframe headers
|
||||||
|
iframeAllowed := CheckIframeAllowed(warcResult.HTTPHeaders)
|
||||||
|
|
||||||
|
// Convert body to UTF-8 based on Content-Type header and HTML meta
|
||||||
|
contentType := warcResult.HTTPHeaders.Get("Content-Type")
|
||||||
|
body := toUTF8(warcResult.Body, contentType)
|
||||||
|
|
||||||
|
// Parse HTML for title and icons
|
||||||
|
parsed := ParseHTML(body, host.Protocol, host.Hostname)
|
||||||
|
|
||||||
|
// Sanitize title — strip any remaining invalid UTF-8 bytes
|
||||||
|
// (handles pages that lie about encoding or have truncated sequences)
|
||||||
|
title := strings.ToValidUTF8(parsed.Title, "")
|
||||||
|
|
||||||
|
return ProcessResult{
|
||||||
|
Title: title,
|
||||||
|
IframeAllowed: iframeAllowed,
|
||||||
|
Icons: parsed.Icons,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// toUTF8 detects the encoding of the HTML body and converts to UTF-8.
|
||||||
|
func toUTF8(body []byte, contentType string) []byte {
|
||||||
|
// DetermineEncoding checks Content-Type header and <meta> tags
|
||||||
|
encoding, _, _ := charset.DetermineEncoding(body, contentType)
|
||||||
|
if encoding == nil {
|
||||||
|
return body
|
||||||
|
}
|
||||||
|
|
||||||
|
reader := transform.NewReader(bytes.NewReader(body), encoding.NewDecoder())
|
||||||
|
utf8Body, err := io.ReadAll(reader)
|
||||||
|
if err != nil {
|
||||||
|
return body
|
||||||
|
}
|
||||||
|
return utf8Body
|
||||||
|
}
|
||||||
126
pipeline/02_warc_parse/warc.go
Normal file
126
pipeline/02_warc_parse/warc.go
Normal file
|
|
@ -0,0 +1,126 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"compress/gzip"
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/aws/aws-sdk-go-v2/aws"
|
||||||
|
"github.com/aws/aws-sdk-go-v2/config"
|
||||||
|
"github.com/aws/aws-sdk-go-v2/service/s3"
|
||||||
|
"github.com/nlnwa/gowarc/v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
const ccBucket = "commoncrawl"
|
||||||
|
|
||||||
|
var s3Client *s3.Client
|
||||||
|
|
||||||
|
func initS3() error {
|
||||||
|
cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion("us-east-1"))
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("load AWS config: %w", err)
|
||||||
|
}
|
||||||
|
s3Client = s3.NewFromConfig(cfg)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// WARCResult carries what FetchAndParseWARC pulls out of one WARC response
// record.
type WARCResult struct {
	// HTTPHeaders are the headers of the archived HTTP response.
	HTTPHeaders http.Header
	// Body is the HTTP payload of the archived response (the HTML).
	Body []byte
}
|
||||||
|
|
||||||
|
// FetchAndParseWARC fetches a WARC record via S3 byte-range request and parses it.
|
||||||
|
func FetchAndParseWARC(warcFilename string, offset, length int64) (*WARCResult, error) {
|
||||||
|
rangeHeader := fmt.Sprintf("bytes=%d-%d", offset, offset+length-1)
|
||||||
|
|
||||||
|
resp, err := s3Client.GetObject(context.Background(), &s3.GetObjectInput{
|
||||||
|
Bucket: aws.String(ccBucket),
|
||||||
|
Key: aws.String(warcFilename),
|
||||||
|
Range: aws.String(rangeHeader),
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("s3 get: %w", err)
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
// Each WARC record is individually gzipped
|
||||||
|
gzReader, err := gzip.NewReader(resp.Body)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("gzip: %w", err)
|
||||||
|
}
|
||||||
|
defer gzReader.Close()
|
||||||
|
|
||||||
|
// Read all decompressed data into memory for gowarc
|
||||||
|
decompressed, err := io.ReadAll(gzReader)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("decompress: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse WARC record using gowarc
|
||||||
|
warcReader, err := gowarc.NewWarcFileReaderFromStream(bytes.NewReader(decompressed), 0)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("warc reader: %w", err)
|
||||||
|
}
|
||||||
|
defer warcReader.Close()
|
||||||
|
|
||||||
|
rec, err := warcReader.Next()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("read record: %w", err)
|
||||||
|
}
|
||||||
|
defer rec.Close()
|
||||||
|
|
||||||
|
if rec.WarcRecord.Type() != gowarc.Response {
|
||||||
|
return nil, fmt.Errorf("unexpected record type: %s", rec.WarcRecord.Type())
|
||||||
|
}
|
||||||
|
|
||||||
|
block := rec.WarcRecord.Block()
|
||||||
|
httpBlock, ok := block.(gowarc.HttpResponseBlock)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("block is not HTTP response")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get HTTP response headers
|
||||||
|
var httpHeaders http.Header
|
||||||
|
headers := httpBlock.HttpHeader()
|
||||||
|
if headers != nil {
|
||||||
|
httpHeaders = *headers
|
||||||
|
} else {
|
||||||
|
httpHeaders = make(http.Header)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get HTTP body (the HTML)
|
||||||
|
bodyReader, err := httpBlock.PayloadBytes()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("payload: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
body, err := io.ReadAll(bodyReader)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("read body: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &WARCResult{
|
||||||
|
HTTPHeaders: httpHeaders,
|
||||||
|
Body: body,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CheckIframeAllowed reports whether the HTTP response headers permit the
// page to be embedded in an iframe on another site.
//
// It returns false when:
//   - any X-Frame-Options value contains "deny" or "sameorigin" (matched
//     case-insensitively; comma-separated lists such as
//     "SAMEORIGIN, SAMEORIGIN" are split first), or
//   - any Content-Security-Policy has a frame-ancestors directive whose
//     source list does not contain the bare wildcard "*". Host wildcards
//     like "*.example.com" restrict embedding and therefore count as blocked.
//
// All values of both headers are examined, not just the first. Absent
// headers mean framing is allowed.
func CheckIframeAllowed(headers http.Header) bool {
	for _, raw := range headers.Values("X-Frame-Options") {
		for _, token := range strings.Split(raw, ",") {
			switch strings.ToLower(strings.TrimSpace(token)) {
			case "deny", "sameorigin":
				return false
			}
		}
	}

	for _, raw := range headers.Values("Content-Security-Policy") {
		// One header value may carry several comma-separated policies;
		// every policy has to allow framing.
		for _, policy := range strings.Split(strings.ToLower(raw), ",") {
			if !frameAncestorsAllowsAll(policy) {
				return false
			}
		}
	}

	return true
}

// frameAncestorsAllowsAll reports whether a single lower-cased CSP policy
// leaves framing unrestricted: either it has no frame-ancestors directive,
// or that directive lists the bare "*" source.
func frameAncestorsAllowsAll(policy string) bool {
	for _, directive := range strings.Split(policy, ";") {
		fields := strings.Fields(directive)
		if len(fields) == 0 || fields[0] != "frame-ancestors" {
			continue
		}
		for _, source := range fields[1:] {
			if source == "*" {
				return true
			}
		}
		// The first frame-ancestors directive decides; later duplicates
		// are not consulted.
		return false
	}
	return true
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue