unable to crawl: zlib compress failed #179

After successfully adding a web site for a crawl, I'm getting the following error: "zlib compress failed". Apparently the problem is somewhere in XmlDoc.cpp, around line 35540.
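For context, zlib reports this kind of failure when compress()/compress2() returns something other than Z_OK, most commonly Z_BUF_ERROR when the destination buffer is too small for the input. Below is a minimal sketch of the call pattern (generic zlib usage, not Gigablast's actual wrapper in XmlDoc.cpp; compressBuf is a hypothetical helper name):

#include <zlib.h>
#include <cstdio>

// Hypothetical helper: compress srcLen bytes from src into dest.
// Returns false instead of crashing when zlib reports an error.
static bool compressBuf ( Bytef *dest, uLongf *destLen,
                          const Bytef *src, uLong srcLen ) {
	// compressBound(srcLen) is the worst-case output size; a *destLen
	// smaller than that can make compress2() fail with Z_BUF_ERROR.
	int err = compress2 ( dest, destLen, src, srcLen,
	                      Z_DEFAULT_COMPRESSION );
	if ( err != Z_OK ) {
		// Z_BUF_ERROR: dest too small; Z_MEM_ERROR: out of memory.
		fprintf ( stderr, "zlib compress failed (err=%d, srcLen=%lu)\n",
		          err, (unsigned long)srcLen );
		return false;
	}
	return true;
}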
Appears to be fixed by acidburn0zzz@e4323f3.
Here is a refreshed patch:

From e4323f397ff14302dcdeb9fb463ed5a8de1316f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Skytte=20J=C3=B8rgensen?= <[email protected]>
Date: Tue, 5 Apr 2016 16:39:35 +0200
Subject: [PATCH] Removed global #pragma pack(4) from <types.h>
The global setting was causing fragility in other places by requiring a
specific include order. Removed the global "#pragma pack(4)" and replaced it
with the equally ugly "__attribute__((packed, aligned(4)))", but at least it
is explicit and doesn't interfere with system and third-party header files
and libraries.
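As a minimal illustration of the difference (stand-in code, not part of the patch): a global "#pragma pack" leaks into every type defined after it, including types in system or third-party headers included later, while the per-type attribute only affects the type it annotates:

#include <cstdint>

// Hypothetical stand-in for types.h's u_int96_t.
class u_int96_like {
public:
	// it's little endian, as in types.h
	uint64_t n0;
	uint32_t n1;
} __attribute__((packed, aligned(4)));

// Without the attribute (and without a global #pragma pack(4)) this class
// would be padded to 16 bytes because n0 wants 8-byte alignment; with it,
// the layout stays at 12 bytes, matching what the old global pragma
// produced, so on-disk record sizes do not change.
static_assert ( sizeof(u_int96_like) == 12, "record size must not change" );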
--- a/Collectiondb.h
+++ b/Collectiondb.h
@@ -303,9 +303,9 @@
void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); };
//bool print (class SafeBuf *sb ) ;
//bool setFromSafeBuf (class SafeBuf *sb ) ;
-};
+} __attribute__((packed, aligned(4)));
class CollectionRec {
--- a/Images.h
+++ b/Images.h
@@ -35,9 +35,9 @@
bool printLink ,
int32_t *newdx ,
char *style = NULL ,
char format = FORMAT_HTML ) ;
-};
+} __attribute__((packed, aligned(4)));
// XmlDoc::ptr_imgData is a ThumbnailArray
class ThumbnailArray {
public:
@@ -59,9 +59,9 @@
p += ti->getSize();
}
return NULL;
};
-};
+} __attribute__((packed, aligned(4)));
class Images {
public:
--- a/Linkdb.h
+++ b/Linkdb.h
@@ -833,9 +833,9 @@
//int32_t m_siteNumInlinks;
// serialize "Inlinks" into this buffer, m_buf[]
char m_buf[0];
-};
+} __attribute__((packed, aligned(4)));
#define MAXINLINKSTRINGBUFSIZE 2048
@@ -1071,9 +1071,9 @@
int32_t size_templateVector ;
char m_buf[MAXINLINKSTRINGBUFSIZE] ;
-};
+} __attribute__((packed, aligned(4)));
// . this function is normally called like "info = makeLinkInfo()"
// to create a new LinkInfo based on a bunch of Msg20 replies
// . returns NULL and sets g_errno on error
--- a/Sections.h
+++ b/Sections.h
@@ -1051,9 +1051,9 @@
// . this is not used for tags of type contenthash or taghash
// . seems like pastdate and futuredate and eurdatefmt
// are the only vote types that actually really use this...
float m_numSampled;
-};
+} __attribute__((packed, aligned(4)));
class SectionVotingTable {
public:
--- a/Spider.h
+++ b/Spider.h
@@ -813,9 +813,9 @@
bool setFromAddUrl ( char *url ) ;
bool setFromInject ( char *url ) ;
bool isCorrupt ( );
-};
+} __attribute__((packed, aligned(4)));
// . XmlDoc adds this record to spiderdb after attempting to spider a url
// supplied to it by a SpiderRequest
// . before adding a SpiderRequest to the spider cache, we scan through
@@ -963,9 +963,9 @@
int32_t print ( class SafeBuf *sbarg );
int64_t getUrlHash48 () {return g_spiderdb.getUrlHash48(&m_key); };
int64_t getParentDocId (){return g_spiderdb.getParentDocId(&m_key);};
-};
+} __attribute__((packed, aligned(4)));
// are we responsible for this ip?
bool isAssignedToUs ( int32_t firstIp ) ;
--- a/SpiderProxy.cpp
+++ b/SpiderProxy.cpp
@@ -57,9 +57,9 @@
int32_t m_proxyPort;
// id of this loadbucket in case same host is using the same
// proxy to download the same urlip
int32_t m_id;
-};
+} __attribute__((packed, aligned(4)));
// . similar to s_ipTable but maps a URL's ip to a LoadBucket
// . every download request in the last 10 minutes is represented by one
// LoadBucket
--- a/Statsdb.h
+++ b/Statsdb.h
@@ -185,9 +185,9 @@
uint32_t m_zero;
uint32_t m_labelHash;
// force to 32-bit even though time_t is 64-bit on 64-bit systems
int32_t m_time1;
-};
+} __attribute__((packed, aligned(4)));
class StatData {
public:
float m_totalOps;
@@ -205,7 +205,7 @@
// m_key.n1 = t1; m_key.n0 = labelHash; };
//int32_t getLabelHash () { return (int32_t)m_labelHash; };
//int32_t getParmHash () { return (int32_t)m_labelHash; };
//int32_t getTime1 () { return m_time1; };
-};
+} __attribute__((packed, aligned(4)));
#endif
--- a/Tagdb.h
+++ b/Tagdb.h
@@ -99,9 +99,9 @@
//char m_user[8];
int32_t m_bufSize;
char m_buf[0];
-};
+} __attribute__((packed, aligned(4)));
// . convert "domain_squatter" to ST_DOMAIN_SQUATTER
// . used by CollectionRec::getRegExpNum()
int32_t getTagTypeFromStr( char *tagTypeName , int32_t tagnameLen = -1 );
--- a/types.h
+++ b/types.h
@@ -31,10 +31,8 @@
#define uint96_t u_int96_t
#define key128_t u_int128_t
#define uint128_t u_int128_t
-#pragma pack(4)
-
class u_int96_t {
public:
// it's little endian
@@ -193,10 +191,9 @@
// TODO: should we fix this?
int32_t operator % ( uint32_t mod ) {
return n0 % mod; };
-};
-//__attribute__((packed));
+} __attribute__((packed, aligned(4)));
class u_int128_t {
public:
@@ -318,9 +315,9 @@
// TODO: should we fix this?
int32_t operator % ( uint32_t mod ) {
return n0 % mod; };
-};
+} __attribute__((packed, aligned(4)));
// used only by m_orderTree in Spider.cpp for RdbTree.cpp
class key192_t {
public:
@@ -368,9 +365,9 @@
n2=0xffffffffffffffffLL;
};
-};
+} __attribute__((packed, aligned(4)));
class key224_t {
public:
// k0 is the LEAST significant int32_t
@@ -428,11 +425,9 @@
n3=0xffffffffffffffffLL;
};
-};
-
-#pragma pack(2)
+} __attribute__((packed, aligned(4)));
class key144_t {
public:
// it's little endian
@@ -476,11 +471,10 @@
n2=0xffffffffffffffffLL;
};
-};
+} __attribute__((packed, aligned(2)));
-#pragma pack(4)
// handy quicky functions
inline char KEYCMP ( char *k1, int32_t a, char *k2, int32_t b , char keySize ) {
posdb

With the above patch applied, I was able to add a URL successfully, but an attempt to search segfaulted:

1620538258007 000 http: May 09 15:30:58 127.0.0.1 GET /search?c=main&q=search HTTP/1.1 Host: localhost:8000 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0 Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 Accept-Language: en-AU,en;q=0.7,en-US;q=0.3 Accept-Encoding: gzip, deflate DNT: 1 Connection: keep-alive Cookie: EndecaPermanent_LastApplicationSelected=tst-nz; ESESSIONID=7FDF62CEE80D4D61CAB963DC04AF9708 Upgrade-Insecure-Requests: 1
1620538258044 000 thread: Using 36708352 bytes for 20 thread stacks.
[New Thread 0x7fff6cb3f700 (LWP 5236)]
[New Thread 0x7fff6cb3f700 (LWP 5237)]
[Thread 0x7fff6cb3f700 (LWP 5236) exited]
[Thread 0x7fff6cb3f700 (LWP 5237) exited]
[New Thread 0x7fff6cb3f700 (LWP 5238)]
[Thread 0x7fff6cb3f700 (LWP 5238) exited]
[New Thread 0x7fff6cb3f700 (LWP 5239)]
[New Thread 0x7fff6cb3f700 (LWP 5240)]
[Thread 0x7fff6cb3f700 (LWP 5239) exited]
[Thread 0x7fff6cb3f700 (LWP 5240) exited]
1620538258077 000 xmldoc: deserialize msg20 reply corruption error
1620538258077 000 xmldoc: DO YOU NEED TO NUKE CACHEDB.DAT?????
1620538258077 000 query: queryLang is 0 for q=search
Thread 1 "gb" received signal SIGSEGV, Segmentation fault.
0x0000555555622bc5 in printInlinkText (sb=0x55555abffb20, mr=0x55555b9153a4, si=0x55555abffb48, numPrinted=0x7ffffffa667c) at PageResults.cpp:3698
3698 if ( info && mr->size_linkInfo!=info->m_lisize ){char *xx=NULL;*xx=0; }
(gdb) bt
#0 0x0000555555622bc5 in printInlinkText (sb=0x55555abffb20, mr=0x55555b9153a4, si=0x55555abffb48, numPrinted=0x7ffffffa667c) at PageResults.cpp:3698
#1 0x00005555556280c5 in printResult (st=0x55555abffb20, ix=0, numPrintedSoFar=0x7fffffffc75c) at PageResults.cpp:5925
#2 0x000055555561e943 in gotResults (state=0x55555abffb20) at PageResults.cpp:1602
#3 0x000055555561d5fe in gotState (state=0x55555abffb20) at PageResults.cpp:789
#4 0x000055555561d5a5 in gotResultsWrapper (state=0x55555abffb20) at PageResults.cpp:766
#5 0x000055555585f974 in gotSummaryWrapper (state=0x55555ac18e18) at Msg40.cpp:1794
#6 0x000055555574bba5 in gotReplyWrapper20 (state=0x55555b91505c, state2=0x0) at Msg20.cpp:305
#7 0x000055555573cd20 in Multicast::closeUpShop (this=0x55555b915094, slot=0x7fff71cdb514) at Multicast.cpp:1559
#8 0x000055555573ca74 in Multicast::gotReply1 (this=0x55555b915094, slot=0x7fff71cdb514) at Multicast.cpp:1466
#9 0x000055555573c47b in gotReplyWrapperM1 (state=0x55555b915094, slot=0x7fff71cdb514) at Multicast.cpp:1292
#10 0x00005555556d3dc5 in UdpServer::makeCallback_ass (this=0x55555608fdc0 <g_udpServer>, slot=0x7fff71cdb514) at UdpServer.cpp:2425
#11 0x00005555556d37b0 in UdpServer::makeCallbacks_ass (this=0x55555608fdc0 <g_udpServer>, niceness=0) at UdpServer.cpp:2167
#12 0x00005555556d1fd3 in UdpServer::process_ass (this=0x55555608fdc0 <g_udpServer>, now=1620538258058, maxNiceness=100) at UdpServer.cpp:1150
#13 0x00005555556d207d in readPollWrapper_ass (fd=3, state=0x55555608fdc0 <g_udpServer>) at UdpServer.cpp:1191
#14 0x00005555557599d8 in Loop::callCallbacks_ass (this=0x55555612ff80 <g_loop>, forReading=true, fd=3, now=1620538258058, niceness=0) at Loop.cpp:536
#15 0x000055555575af13 in Loop::doPoll (this=0x55555612ff80 <g_loop>) at Loop.cpp:2133
#16 0x000055555575aa88 in Loop::runLoop (this=0x55555612ff80 <g_loop>) at Loop.cpp:1425
#17 0x00005555555e5a02 in main2 (argc=1, argv=0x7fffffffdc98) at main.cpp:4091
#18 0x00005555555deeb5 in main (argc=1, argv=0x7fffffffdc98) at main.cpp:399
Related to the search segfault: acidburn0zzz@feac1e7

--- a/PageResults.cpp
+++ b/PageResults.cpp
@@ -3693,10 +3693,13 @@
// from the TitleRec, which is much faster but more stale.
// . "&inlinks=1" is slow and fresh, "&inlinks=2" is fast
// and stale. Both are really only for BuzzLogic.
LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
- // sanity
- if ( info && mr->size_linkInfo!=info->m_lisize ){char *xx=NULL;*xx=0; }
+ // Corrupted linkinfo has been seen. In that case just log an error and do not try to print any link texts
+ if(info && mr->size_linkInfo!=info->m_lisize) {
+ log("ERROR results: mr->size_linkInfo(%d) != info->m_lisize (%d)", mr->size_linkInfo, info->m_lisize);
+ return false;
+ }
// NULLify if empty
if ( mr->size_linkInfo <= 0 ) info = NULL;
// do not both if none
if ( info && ! info->m_numStoredInlinks ) info = NULL;
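The design choice here is worth noting: the removed line used the codebase's crash-on-corruption idiom (char *xx=NULL;*xx=0;, a deliberate null write that aborts the process so corruption cannot go unnoticed), while the fix logs the mismatch and skips the record. A sketch of the general pattern for validating a serialized blob before trusting it (hypothetical names, not the actual LinkInfo layout):

#include <cstdint>
#include <cstdio>

// Hypothetical header: the first field records the total serialized size,
// playing the role of LinkInfo::m_lisize in the patch above.
struct BlobHeader {
	int32_t m_size; // size the writer claims the blob occupies
};

// Validate a received blob before using any of its contents. Returning
// false lets the caller log and skip the record instead of crashing.
static bool validateBlob ( const char *buf, int32_t bufSize ) {
	if ( ! buf || bufSize < (int32_t)sizeof(BlobHeader) )
		return false; // too small to even hold the header
	const BlobHeader *h = (const BlobHeader *)buf;
	// same condition the patch checks: embedded size vs. actual size
	if ( h->m_size != bufSize ) {
		fprintf ( stderr, "corrupt blob: claims %d bytes, got %d\n",
		          h->m_size, bufSize );
		return false;
	}
	return true;
}

For a long-running search node, failing one result and logging is usually preferable to the deliberate segfault, at the cost of making corruption easier to overlook.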
onlyjob added a commit to onlyjob/open-source-search-engine that referenced this issue on Jun 18, 2021: "…ry-picked commits. (Closes: gigablast#86, gigablast#179)"