Skip to content

Commit

Permalink
Merge pull request byzer-org#591 from wuheyi/regex_user
Browse the repository at this point in the history
[update] 去除新版app自带的用户标签
  • Loading branch information
cfmcgrady authored Oct 16, 2018
2 parents 0c147ac + f7d3a56 commit 09ae611
Showing 1 changed file with 9 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ class SQLFeatureExtractInPlace extends SQLAlg with Functions {
})

def cleanDoc = F.udf((doc: String) => {
/**
* 去除新版app内自带的用户标签
*/
val regEx_user = """<div class="quote"><blockquote><b>[\s\S]*?</b><br>"""
val p_user = Pattern.compile(regEx_user, Pattern.CASE_INSENSITIVE)
val m_user = p_user.matcher(doc)
var htmlStr = m_user.replaceAll("")
/**
* 去除html标签
*/
Expand All @@ -97,8 +104,8 @@ class SQLFeatureExtractInPlace extends SQLAlg with Functions {
val regEx_html = "<[^>]+>"
// 过滤script标签
val p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE)
val m_script = p_script.matcher(doc)
var htmlStr = m_script.replaceAll("")
val m_script = p_script.matcher(htmlStr)
htmlStr = m_script.replaceAll("")
// 过滤style标签
val p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE)
val m_style = p_style.matcher(htmlStr)
Expand Down

0 comments on commit 09ae611

Please sign in to comment.