MDL-31928: Fixing bugs in repository_url

- If the same image occurs several times on the page, list it only once.
- Resolve the image path correctly if it has a query string.
- Show images included in CSS.
- The non-JS file picker ignored the thumbnail width and height attributes; they are now applied (this is a temp fix until renderers are implemented).
hello-josh · Apr 23, 2012 · 8685679 · 8685679
1 parent ead4f18
commit 8685679
Showing 3 changed files with 168 additions and 47 deletions.
diff --git a/repository/filepicker.php b/repository/filepicker.php
@@ -139,7 +139,15 @@
         echo '<table>';
         foreach ($search_result['list'] as $item) {
             echo '<tr>';
-            echo '<td><img src="'.$item['thumbnail'].'" />';
+            echo '<td>';
+            $style = '';
+            if (isset($item['thumbnail_height'])) {
+                $style .= 'max-height:'.$item['thumbnail_height'].'px;';
+            }
+            if (isset($item['thumbnail_width'])) {
+                $style .= 'max-width:'.$item['thumbnail_width'].'px;';
+            }
+            echo html_writer::empty_tag('img', array('src' => $item['thumbnail'], 'style' => $style));
             echo '</td><td>';
             if (!empty($item['url'])) {
                 echo html_writer::link($item['url'], $item['title'], array('target'=>'_blank'));
@@ -227,7 +235,15 @@
             echo '<table>';
             foreach ($list['list'] as $item) {
                 echo '<tr>';
-                echo '<td><img src="'.$item['thumbnail'].'" />';
+                echo '<td>';
+                $style = '';
+                if (isset($item['thumbnail_height'])) {
+                    $style .= 'max-height:'.$item['thumbnail_height'].'px;';
+                }
+                if (isset($item['thumbnail_width'])) {
+                    $style .= 'max-width:'.$item['thumbnail_width'].'px;';
+                }
+                echo html_writer::empty_tag('img', array('src' => $item['thumbnail'], 'style' => $style));
                 echo '</td><td>';
                 if (!empty($item['url'])) {
                     echo html_writer::link($item['url'], $item['title'], array('target'=>'_blank'));

diff --git a/repository/url/lib.php b/repository/url/lib.php
@@ -30,6 +30,7 @@
 require_once(dirname(__FILE__).'/locallib.php');
 
 class repository_url extends repository {
+    var $processedfiles = array();
 
     /**
      * @param int $repositoryid
@@ -42,16 +43,6 @@ public function __construct($repositoryid, $context = SYSCONTEXTID, $options = a
         $this->file_url = optional_param('file', '', PARAM_RAW);
     }
 
-    public function get_file($url, $file = '') {
-        global $CFG;
-        //$CFG->repository_no_delete = true;
-        $path = $this->prepare_file($file);
-        $fp = fopen($path, 'w');
-        $c = new curl;
-        $c->download(array(array('url'=>$url, 'file'=>$fp)));
-        return array('path'=>$path, 'url'=>$url);
-    }
-
     public function check_login() {
         if (!empty($this->file_url)) {
             return true;
@@ -75,6 +66,7 @@ public function print_login() {
 
             $ret['login'] = array($url);
             $ret['login_btn_label'] = get_string('download', 'repository_url');
+            $ret['allowcaching'] = true; // indicates that login form can be cached in filepicker.js
             return $ret;
         } else {
             echo <<<EOD
@@ -97,48 +89,113 @@ public function print_login() {
     public function get_listing($path='', $page='') {
         global $CFG, $OUTPUT;
         $ret = array();
+        $ret['list'] = array();
+        $ret['nosearch'] = true;
+        $ret['norefresh'] = true;
+        $ret['nologin'] = true;
+
+        $this->parse_file(null, $this->file_url, $ret, true);
+        return $ret;
+    }
+
+    /**
+     * Parses one file (either html or css)
+     *
+     * @param string $baseurl (optional) URL of the file where link to this file was found
+     * @param string $relativeurl relative or absolute link to the file
+     * @param array $list
+     * @param bool $mainfile true only for the main HTML file and false for all embedded/linked files
+     */
+    protected function parse_file($baseurl, $relativeurl, &$list, $mainfile = false) {
+        if (preg_match('/([\'"])(.*)\1/', $relativeurl, $matches)) {
+            $relativeurl = $matches[2];
+        }
+        if (empty($baseurl)) {
+            $url = $relativeurl;
+        } else {
+            $url = htmlspecialchars_decode(url_to_absolute($baseurl, $relativeurl));
+        }
+        if (in_array($url, $this->processedfiles)) {
+            // avoid endless recursion
+            return;
+        }
+        $this->processedfiles[] = $url;
         $curl = new curl;
-        $msg = $curl->head($this->file_url);
+        $msg = $curl->head($url);
         $info = $curl->get_info();
         if ($info['http_code'] != 200) {
-            $ret['e'] = $msg;
+            if ($mainfile) {
+                $list['error'] = $msg;
+            }
         } else {
-            $ret['list'] = array();
-            $ret['nosearch'] = true;
-            $ret['nologin'] = true;
-            $filename = $this->guess_filename($info['url'], $info['content_type']);
-            if (strstr($info['content_type'], 'text/html') || empty($info['content_type'])) {
-                // analysis this web page, general file list
-                $ret['list'] = array();
-                $content = $curl->get($info['url']);
-                $this->analyse_page($info['url'], $content, $ret);
-            } else {
+            $csstoanalyze = '';
+            if ($mainfile && (strstr($info['content_type'], 'text/html') || empty($info['content_type']))) {
+                // parse as html
+                $htmlcontent = $curl->get($info['url']);
+                $ddoc = new DOMDocument();
+                @$ddoc->loadHTML($htmlcontent);
+                // extract <img>
+                $tags = $ddoc->getElementsByTagName('img');
+                foreach ($tags as $tag) {
+                    $url = $tag->getAttribute('src');
+                    $this->add_image_to_list($info['url'], $url, $list);
+                }
+                // analyse embedded css (<style>)
+                $tags = $ddoc->getElementsByTagName('style');
+                foreach ($tags as $tag) {
+                    if ($tag->getAttribute('type') == 'text/css') {
+                        $csstoanalyze .= $tag->textContent."\n";
+                    }
+                }
+                // analyse links to css (<link type='text/css' href='...'>)
+                $tags = $ddoc->getElementsByTagName('link');
+                foreach ($tags as $tag) {
+                    if ($tag->getAttribute('type') == 'text/css' && strlen($tag->getAttribute('href'))) {
+                        $this->parse_file($info['url'], $tag->getAttribute('href'), $list);
+                    }
+                }
+            } else if (strstr($info['content_type'], 'css')) {
+                // parse as css
+                $csscontent = $curl->get($info['url']);
+                $csstoanalyze .= $csscontent."\n";
+            } else if (strstr($info['content_type'], 'image/')) {
                 // download this file
-                $ret['list'][] = array(
-                    'title'=>$filename,
-                    'source'=>$this->file_url,
-                    'thumbnail' => $OUTPUT->pix_url(file_extension_icon($filename, 32))->out(false)
-                    );
+                $this->add_image_to_list($info['url'], $info['url'], $list);
+            }
+
+            // parse all found css styles
+            if (strlen($csstoanalyze)) {
+                $urls = extract_css_urls($csstoanalyze);
+                if (!empty($urls['property'])) {
+                    foreach ($urls['property'] as $url) {
+                        $this->add_image_to_list($info['url'], $url, $list);
+                    }
+                }
+                if (!empty($urls['import'])) {
+                    foreach ($urls['import'] as $cssurl) {
+                        $this->parse_file($info['url'], $cssurl, $list);
+                    }
+                }
             }
         }
-        return $ret;
     }
-    public function analyse_page($baseurl, $content, &$list) {
-        global $CFG, $OUTPUT;
-        $urls = extract_html_urls($content);
-        $images = $urls['img']['src'];
-        $pattern = '#img(.+)src="?\'?([[:alnum:]:?=&@/._+-]+)"?\'?#i';
-        if (!empty($images)) {
-            foreach($images as $url) {
-                $list['list'][] = array(
-                    'title'=>$this->guess_filename($url, ''),
-                    'source'=>url_to_absolute($baseurl, $url),
-                    'thumbnail'=>url_to_absolute($baseurl, $url),
-                    'thumbnail_height'=>84,
-                    'thumbnail_width'=>84
-                );
+    protected function add_image_to_list($baseurl, $url, &$list) {
+        if (empty($list['list'])) {
+            $list['list'] = array();
+        }
+        $src = url_to_absolute($baseurl, htmlspecialchars_decode($url));
+        foreach ($list['list'] as $image) {
+            if ($image['source'] == $src) {
+                return;
             }
         }
+        $list['list'][] = array(
+            'title'=>$this->guess_filename($url, ''),
+            'source'=>$src,
+            'thumbnail'=>$src,
+            'thumbnail_height'=>84,
+            'thumbnail_width'=>84
+        );
     }
     public function guess_filename($url, $type) {
         $pattern = '#\/([\w_\?\-.]+)$#';

diff --git a/repository/url/locallib.php b/repository/url/locallib.php
@@ -79,6 +79,9 @@ function url_to_absolute( $baseUrl, $relativeUrl )
 	if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
 		return FALSE;
 	$r['scheme'] = $b['scheme'];
+    if (empty($b['path'])) {
+        $b['path'] = '';
+    }
 
 	// If relative URL has an authority, clean path and return.
 	if ( isset( $r['host'] ) )
@@ -248,11 +251,11 @@ function url_remove_dot_segments( $path )
  * 	the associative array of URL parts, or FALSE if the URL is
  * 	too malformed to recognize any parts.
  */
-function split_url( $url, $decode=TRUE )
+function split_url( $url, $decode=FALSE)
 {
 	// Character sets from RFC3986.
 	$xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
-	$xpchar        = $xunressub . ':@%';
+	$xpchar        = $xunressub . ':@% ';
 
 	// Scheme from RFC3986.
 	$xscheme        = '([a-zA-Z][a-zA-Z\d+-.]*)';
@@ -382,7 +385,7 @@ function split_url( $url, $decode=TRUE )
  * 	empty string is returned if the $parts array does not contain
  * 	any of the needed values.
  */
-function join_url( $parts, $encode=TRUE )
+function join_url( $parts, $encode=FALSE)
 {
 	if ( $encode )
 	{
@@ -432,6 +435,51 @@ function join_url( $parts, $encode=TRUE )
 		$url .= '#' . $parts['fragment'];
 	return $url;
 }
+
+/**
+ * This function encodes URL to form a URL which is properly
+ * percent encoded to replace disallowed characters.
+ *
+ * RFC3986 specifies the allowed characters in the URL as well as
+ * reserved characters in the URL. This function replaces all the
+ * disallowed characters in the URL with their respective percent
+ * encodings. Already encoded characters are not encoded again,
+ * such as '%20' is not encoded to '%2520'.
+ *
+ * Parameters:
+ * 	url		the url to encode.
+ *
+ * Return values:
+ * 	Returns the encoded URL string.
+ */
+function encode_url($url) {
+  $reserved = array(
+    ":" => '!%3A!ui',
+    "/" => '!%2F!ui',
+    "?" => '!%3F!ui',
+    "#" => '!%23!ui',
+    "[" => '!%5B!ui',
+    "]" => '!%5D!ui',
+    "@" => '!%40!ui',
+    "!" => '!%21!ui',
+    "$" => '!%24!ui',
+    "&" => '!%26!ui',
+    "'" => '!%27!ui',
+    "(" => '!%28!ui',
+    ")" => '!%29!ui',
+    "*" => '!%2A!ui',
+    "+" => '!%2B!ui',
+    "," => '!%2C!ui',
+    ";" => '!%3B!ui',
+    "=" => '!%3D!ui',
+    "%" => '!%25!ui',
+  );
+
+  $url = rawurlencode($url);
+  $url = preg_replace(array_values($reserved), array_keys($reserved), $url);
+  return $url;
+}
+
 /**
  * Extract URLs from a web page.
  *